From 2d0bd8effa72ab31142370fb16a063ab295e8f88 Mon Sep 17 00:00:00 2001 From: anolis-bot Date: Thu, 31 Mar 2022 18:21:58 +0800 Subject: [PATCH] import qemu-kvm-4.2.0-59.module+el8.5.0+14169+68d2f392.2.src.rpm --- 0005-Initial-redhat-build.patch | 167 + 0006-Enable-disable-devices-for-RHEL.patch | 994 ++ ...Machine-type-related-general-changes.patch | 675 + 0008-Add-aarch64-machine-types.patch | 276 + 0009-Add-ppc64-machine-types.patch | 463 + 0010-Add-s390x-machine-types.patch | 126 + 0011-Add-x86_64-machine-types.patch | 897 + 0012-Enable-make-check.patch | 307 + ...mber-of-devices-that-can-be-assigned.patch | 114 + ...Add-support-statement-to-help-output.patch | 58 + ...lly-limit-the-maximum-number-of-CPUs.patch | 152 + 0016-Add-support-for-simpletrace.patch | 121 + ...documentation-instead-of-qemu-system.patch | 118 + 0018-usb-xhci-Fix-PCI-capability-order.patch | 96 + ...ct-scsi-cd-if-data-plane-enabled-RHE.patch | 69 + ...e-at-least-64kiB-pages-for-downstrea.patch | 60 + ...er-m_free-might-read-pointers-from-a.patch | 61 + 81-kvm-rhel.rules | 1 + 85-kvm.preset | 5 + 95-kvm-memlock.conf | 10 + 99-qemu-guest-agent.rules | 2 + README.md | 11 - README.tests | 39 + bridge.conf | 1 + ksm.service | 13 + ksm.sysconfig | 4 + ksmctl.c | 77 + ksmtuned | 139 + ksmtuned.conf | 21 + ksmtuned.service | 12 + ...pected-files-for-HMAT-tests-acpihmat.patch | 41 + kvm-Add-mtod_check.patch | 68 + kvm-Compress-lines-for-immediate-return.patch | 242 + ...-leak-memory-when-reallocation-fails.patch | 58 + kvm-Drop-bogus-IPv6-messages.patch | 51 + kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch | 59 + ...-afte-free-in-ip_reass-CVE-2020-1983.patch | 60 + ...qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch | 55 + ...able-nested-PERF_GLOBAL_CTRL-MSR-sup.patch | 53 + ...ate-dirty_bmap-when-we-change-a-slot.patch | 115 + ...remaining-malloc-free-user-with-glib.patch | 118 + kvm-Revert-RHEL-disable-hostmem-memfd.patch | 58 + ...n-t-let-an-operation-wait-for-itself.patch | 121 + 
...sd-fix-memory-leak-on-fuse-queueinfo.patch | 63 + ...-and-word-access-to-core-ACPI-regist.patch | 82 + ...ompletely-stop-polling-when-disabled.patch | 104 + ...e-polling-of-main-AioContext-if-BQL-.patch | 132 + ...it-APIC-ID-for-migration-instance-ID.patch | 62 + kvm-async-use-explicit-memory-barriers.patch | 183 + ...eric_get_buffer_in-should-honor-size.patch | 53 + ...ove-error-for-bdrv_getlength-failure.patch | 51 + ...re-that-source-and-target-size-match.patch | 124 + ...-acquire-aio_context-in-backup_clean.patch | 57 + kvm-backup-top-Begin-drain-earlier.patch | 56 + ...ecursively-even-for-already-active-n.patch | 116 + ...lags-to-BlockDriver.bdrv_co_truncate.patch | 283 + ...lock-Add-flags-to-bdrv-_co-_truncate.patch | 353 + ...tion-to-truncation-of-long-NBD-expor.patch | 105 + ..._flight-during-blk_wait_while_draine.patch | 84 + ...x-cross-AioContext-blockdev-snapshot.patch | 91 + ...ix-leak-in-bdrv_create_file_fallback.patch | 60 + ...block-Generic-file-creation-fallback.patch | 227 + ...B.in_flight-for-coroutine-and-sync-i.patch | 295 + ...troduce-bdrv_reopen_commit_post-step.patch | 65 + ...Make-bdrv_get_cumulative_perm-public.patch | 67 + ...sier-to-learn-which-BDS-support-bitm.patch | 145 + ...x-restrictions-for-blockdev-snapshot.patch | 117 + ...igned-image-size-to-avoid-assertion-.patch | 77 + ...x-blockdev-reopen-API-with-feature-f.patch | 57 + ...l-entire-LUKS-header-space-with-zero.patch | 308 + ...ck-backend-Add-flags-to-blk_truncate.patch | 294 + ...order-flush-pdiscard-function-defini.patch | 158 + ...-Don-t-acquire-context-while-droppin.patch | 130 + ...n-with-backing-file-in-different-Aio.patch | 114 + ...header-field-names-are-case-insensit.patch | 55 + ...header-fields-allow-whitespace-aroun.patch | 76 + ...-Fix-problem-with-fallocate-PUNCH_HO.patch | 76 + ...ducing-bdrv_co_delete_file-interface.patch | 99 + ...heap-buffer-overflow-in-iscsi_aio_io.patch | 100 + kvm-block-nbd-Fix-hang-in-.bdrv_close.patch | 78 + 
...Driver-reference-to-the-.bdrv_co_cre.patch | 328 + ...-bitmap-reopen-into-bdrv_reopen_comm.patch | 78 + ...wn-the-fallback-image-creation-funct.patch | 296 + ...Don-t-make-backing-file-data-visible.patch | 94 + kvm-block.c-adding-bdrv_co_delete_file.patch | 92 + ...-AioContext-on-dirty-bitmap-function.patch | 176 + ...-several-bitmap-functions-to-non-sta.patch | 179 + ...bs-to-the-proper-context-on-snapshot.patch | 107 + ...ff-basic-bitmap-operations-for-qemu-.patch | 720 + ...ing-style-issues-in-drive_backup_pre.patch | 62 + ...drv_try_set_aio_context-context-requ.patch | 204 + ...mp_blockdev_backup-and-blockdev-back.patch | 144 + ...mp_drive_backup-and-drive-backup-tra.patch | 419 + kvm-bootp-check-bootp_input-buffer-size.patch | 52 + ...or-specific-area-to-input-packet-mem.patch | 175 + ...me-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch | 137 + ...do-not-make-qemu-ga-link-with-pixman.patch | 2463 +++ ...ch-to-use-qemu_receive_packet-for-lo.patch | 60 + ...mpat-disable-edid-for-virtio-gpu-ccw.patch | 50 + kvm-config-enable-VFIO_CCW.patch | 39 + ...ost-user-Protect-slave-fd-with-mutex.patch | 134 + ...-created-file-when-block_crypto_co_c.patch | 98 + ...tures-Make-kvm-no-adjvtime-comment-c.patch | 56 + ...o-use-qemu_receive_packet-for-loopba.patch | 53 + ...e1000-fail-early-for-evil-descriptor.patch | 65 + kvm-e1000-fix-tx-re-entrancy-problem.patch | 71 + ...use-qemu_receive_packet-for-loopback.patch | 52 + kvm-enable-ramfb.patch | 72 + ...error-Document-Error-API-usage-rules.patch | 154 + ...ix-examples-in-error.h-s-big-comment.patch | 85 + kvm-error-Improve-error.h-s-big-comment.patch | 146 + kvm-error-New-macro-ERRP_GUARD.patch | 305 + ...-Free-rom-data-during-inmigrate-skip.patch | 85 + ...Allow-byte-aligned-O_DIRECT-with-NFS.patch | 96 + kvm-file-posix-Drop-hdev_co_create_opts.patch | 131 + ...Handle-EINVAL-fallocate-return-value.patch | 59 + ...ate-file-fragmentation-with-extent-s.patch | 466 + ...ort-BDRV_REQ_ZERO_WRITE-for-truncate.patch | 48 + 
...pat-add-g_unix_get_passwd_entry_qemu.patch | 89 + ...Memory-Proximity-Domain-Attributes-S.patch | 275 + ...Memory-Side-Cache-Information-Struct.patch | 137 + ...System-Locality-Latency-and-Bandwidt.patch | 173 + ...ow-using-qdev-ID-for-qemu-io-command.patch | 100 + ...oduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch | 222 + ...rm-smmu-Introduce-smmu_get_iotlb_key.patch | 166 + ...hw-arm-smmu-common-Add-IOTLB-helpers.patch | 181 + ...on-Factorize-some-code-in-smmu_ptw_6.patch | 124 + ...mu-common-Manage-IOTLB-block-entries.patch | 274 + ...x-potential-integer-overflow-CID-143.patch | 67 + ...-Get-prepared-for-range-invalidation.patch | 255 + ...troduce-smmuv3_s1_range_inval-helper.patch | 115 + ...check-return-value-of-usb_packet_map.patch | 61 + ...Fix-interrupt-ID-in-GICD_SGIR-regist.patch | 80 + ...vance-desc_offset-in-case-of-null-de.patch | 62 + ...t-fix-assertion-failure-in-net_tx_pk.patch | 56 + ...id-hot-plug-if-it-s-disabled-on-the-.patch | 77 + ...-hot-plug-capability-check-to-pre_pl.patch | 90 + ...ace-PCI_DEVICE-casts-with-existing-v.patch | 62 + ...k-MODE_PAGE_ALLS-not-allowed-in-MODE.patch | 61 + ...w-default-SMBIOS-fields-for-Windows-.patch | 262 + ...d-2nd-Generation-AMD-EPYC-processors.patch | 199 + kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch | 46 + kvm-i386-Add-macro-for-stibp.patch | 49 + kvm-i386-Add-new-CPU-model-Cooperlake.patch | 108 + ...port-for-AMD-EPYC-3rd-generation-pro.patch | 213 + ...M-features-if-nested-SVM-is-disabled.patch | 82 + kvm-i386-Remove-cpu64-rhel6-CPU-model.patch | 77 + ...-Resolve-CPU-models-to-v1-by-default.patch | 95 + ...logical-block-address-and-read-size-.patch | 120 + ...-page-walking-on-device-iotlb-invali.patch | 58 + ...roduce-kvm_kernel_irqchip_-functions.patch | 281 + ...6-Move-v3-exclusive-test-to-new-file.patch | 241 + ...est-EIO-on-allocation-in-a-data-file.patch | 112 + ...est-EIO-on-preallocated-zero-cluster.patch | 102 + ...5-refactor-compressed-backup-to-vmdk.patch | 176 + 
...-vmdk-target-tests-if-vmdk-is-not-wh.patch | 45 + ...09-Don-t-mirror-with-mismatched-size.patch | 387 + ...-229-Use-blkdebug-to-inject-an-error.patch | 120 + kvm-iotests-Add-iothread-cases-to-155.patch | 147 + ...-skip_if_unsupported-statements-to-t.patch | 236 + kvm-iotests-Add-qemu_io_log.patch | 48 + ...-291-to-for-qemu-img-bitmap-coverage.patch | 253 + ...Add-test-for-image-creation-fallback.patch | 138 + ...up-with-different-source-target-size.patch | 105 + kvm-iotests-Create-VM.blockdev_create.patch | 59 + ...ter-testfiles-out-in-filter_img_info.patch | 52 + ...tests-Fix-run_job-with-use_log-False.patch | 47 + kvm-iotests-Fix-test-178.patch | 59 + ...-_make_test_img-parse-its-parameters.patch | 91 + ...or-with-different-source-target-size.patch | 110 + ...r-blockdev-reopen-test-for-iothreads.patch | 122 + ...ests-Support-job-complete-in-run_job.patch | 46 + ...est-committing-to-short-backing-file.patch | 480 + ...Test-external-snapshot-with-VM-state.patch | 189 + ...dling-of-AioContexts-with-some-block.patch | 322 + ...ror-with-temporarily-disabled-target.patch | 162 + ...iotests-Use-complete_and_wait-in-155.patch | 50 + ...tests-don-t-use-format-for-drive_add.patch | 81 + ...py-Let-wait_migration-wait-even-more.patch | 123 + ...count-from-GET-LBA-STATUS-CVE-2020-1.patch | 79 + kvm-iscsi-Drop-iscsi_co_create_opts.patch | 113 + ...b-s-lock-individually-in-job_txn_app.patch | 213 + ...o-use-qemu_receive_packet-for-loopba.patch | 53 + ...-pc-use-32-bit-write-for-EJ-register.patch | 47 + ...hci-use-32-bit-write-for-config-regi.patch | 48 + ...t-user-Fix-some-memtable-remap-cases.patch | 117 + ...ndle-endianness-as-mandated-by-the-s.patch | 290 + ...x-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch | 43 + ...Partial-update-against-Linux-5.9-rc4.patch | 83 + ...rs-add-vfio-DMA-available-capability.patch | 54 + ...ux-headers-support-vfio-ccw-features.patch | 77 + kvm-linux-headers-update-kvm.h.patch | 119 + kvm-memory-Add-IOMMUTLBEvent.patch | 590 + 
..._NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch | 89 + ...mory_region_notify_one-to-memory_reg.patch | 146 + ...mory-accept-mismatching-sizes-in-mem.patch | 104 + ...range-assertion-if-notifier-is-DEVIO.patch | 70 + ...hed-translation-in-case-it-points-to.patch | 87 + ...-SaveStateEntry.instance_id-into-uin.patch | 179 + ...igration-Create-migration_is_running.patch | 119 + ...ation-Define-VMSTATE_INSTANCE_ID_ANY.patch | 257 + ...n-Don-t-send-data-if-we-have-stopped.patch | 42 + ...ure-that-we-don-t-call-write-in-case.patch | 94 + ...VM-is-paused-when-migration-is-cance.patch | 70 + ...gration-Rate-limit-inside-host-pages.patch | 172 + ...ifd-clean-pages-after-filling-packet.patch | 65 + ...d-fix-destroyed-mutex-access-in-term.patch | 77 + ...d-fix-nullptr-access-in-multifd_send.patch | 75 + ...d-fix-nullptr-access-in-terminating-.patch | 68 + ...n-t-let-an-operation-wait-for-itself.patch | 123 + ...re-that-source-and-target-size-match.patch | 89 + ...-Store-MirrorOp.co-for-debuggability.patch | 51 + ...r-Wait-only-for-in-flight-operations.patch | 95 + ...o-length-arrays-with-flexible-array-.patch | 255 + ...e-that-we-don-t-do-any-IO-after-an-e.patch | 74 + ...-long-error-message-assertions-CVE-2.patch | 161 + ...-file-descriptor-is-valid-before-usi.patch | 301 + ...s-from-probing-vnet-hdr-flag-for-TAP.patch | 221 + kvm-net-forbid-the-reentrant-RX.patch | 50 + kvm-net-introduce-qemu_receive_packet.patch | 187 + ...e-an-assert-call-in-eth_get_gso_type.patch | 59 + ...to-provide-initiator-information-for.patch | 318 + ...to-provide-memory-latency-and-bandwi.patch | 545 + ...to-provide-memory-side-cache-informa.patch | 326 + ...-properly-check-if-numa-is-supported.patch | 81 + kvm-numa-remove-not-needed-check.patch | 59 + ...if-NVRAM-cannot-contain-all-prom-env.patch | 250 + ...-Allow-booting-in-case-the-first-vir.patch | 112 + ...-Do-not-bail-out-early-if-not-findin.patch | 214 + ...-Introduce-ENODEV-define-and-remove-.patch | 54 + 
...-Makefile-Compile-with-std-gnu99-fwr.patch | 60 + ...-Move-ipl-related-code-from-main-int.patch | 72 + ...-Move-the-inner-logic-of-find_subch-.patch | 154 + ...-Scan-through-all-devices-if-no-boot.patch | 116 + ...-break-loop-if-a-null-block-number-i.patch | 50 + ...-don-t-try-to-read-the-next-block-if.patch | 48 + ...c-bios-s390-ccw-fix-off-by-one-error.patch | 51 + ...-main-Remove-superfluous-call-to-ena.patch | 43 + ...90x-Clear-out-leftover-S390EP-string.patch | 87 + ...390x-Ensure-Read-IPL-memory-is-clean.patch | 63 + kvm-pc-bios-s390x-Fix-reset-psw-mask.patch | 75 + ...Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch | 45 + ...-s390x-Save-iplb-location-in-lowcore.patch | 145 + ...e-PSW-masks-where-possible-and-intro.patch | 89 + ...ot_port-Add-hotplug-disabling-option.patch | 153 + ...use-qemu_receive_packet-for-loopback.patch | 54 + ...-external-interrupt-pin-in-KVM-on-re.patch | 107 + ...UPPCState-irq_input_state-with-moder.patch | 112 + ...tremovable-flag-on-DIMM-LMBs-on-drme.patch | 82 + ...ert-IRQs-during-event-scan-if-there-.patch | 67 + ...rite-only-overlay-feature-for-blockd.patch | 64 + ...ble-use-of-g_autoptr-with-QAPI-types.patch | 237 + ...2-Expose-bitmaps-size-during-measure.patch | 495 + ...cluster_abort-for-pre-existing-clust.patch | 47 + ...alloc_cluster_abort-for-external-dat.patch | 52 + ...RO_WRITE-flag-for-full-preallocation.patch | 98 + ...ort-BDRV_REQ_ZERO_WRITE-for-truncate.patch | 101 + ...qemu-file-Don-t-do-IO-after-shutdown.patch | 92 + kvm-qemu-img-Add-bitmap-sub-command.patch | 398 + kvm-qemu-img-Add-convert-bitmaps-option.patch | 244 + ...-Factor-out-code-for-merging-bitmaps.patch | 89 + ...mu-img-convert-Don-t-pre-zero-images.patch | 73 + ...d-cvtnum_full-to-print-error-reports.patch | 241 + ..._host_name-instead-of-g_get_host_nam.patch | 73 + kvm-qga-add-command-guest-get-disks.patch | 115 + ...ntation-of-guest-get-disks-for-Linux.patch | 427 + ...tation-of-guest-get-disks-for-Window.patch | 181 + 
...-argument-to-ssh-add-authorized-keys.patch | 176 + ...a-add-ssh-add-remove-authorized-keys.patch | 525 + kvm-qga-add-ssh-get-authorized-keys.patch | 170 + ...ix-Move-the-udev-code-from-the-pci-t.patch | 140 + ...ix-Rework-build_guest_fsinfo_for_rea.patch | 156 + ...ix-Support-fsinfo-for-non-PCI-virtio.patch | 94 + ...-assert-regression-on-guest-shutdown.patch | 61 + ...sing-closedir-in-qmp_guest_get_disks.patch | 54 + ...-Error-parameter-to-more-common-errp.patch | 121 + ...a-for-guest-get-disks-dependents-fie.patch | 113 + ...ort-BDRV_REQ_ZERO_WRITE-for-truncate.patch | 55 + ...qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch | 72 + ...rt-we-own-context-before-job_cancel_.patch | 57 + ...o-use-qemu_receive_packet-for-loopba.patch | 54 + ...390-guest-support-for-diagnose-0x318.patch | 282 + ...-by-one-in-update_machine_ipl_proper.patch | 54 + kvm-s390-ipl-sync-back-loadparm.patch | 91 + ...g318-propagation-and-reset-functiona.patch | 163 + ...tended-length-sccb-support-for-kvm-g.patch | 220 + ...heck-sccb-len-before-filling-in-data.patch | 106 + ...achine-once-during-read-scp-cpu-info.patch | 75 + ...-sclp-improve-special-wait-psw-logic.patch | 52 + ...ccb-from-mem-based-on-provided-lengt.patch | 170 + ...390-sclp-rework-sclp-boundary-checks.patch | 80 + ...use-cpu-offset-to-locate-cpu-entries.patch | 67 + kvm-s390x-Add-SIDA-memory-ops.patch | 150 + ...90x-Add-missing-vcpu-reset-functions.patch | 176 + ...x-Add-unpack-facility-feature-to-GA1.patch | 76 + kvm-s390x-Beautify-diag308-handling.patch | 130 + ...do-a-normal-reset-on-the-initial-cpu.patch | 52 + ...90x-Fix-cpu-normal-reset-ri-clearing.patch | 101 + kvm-s390x-Move-clear-reset.patch | 146 + ...nose-308-subcodes-and-rcs-into-ipl.h.patch | 83 + kvm-s390x-Move-initial-reset.patch | 159 + ...reset-normal-to-shared-reset-handler.patch | 145 + ...etch-and-test-the-short-psw-on-diag3.patch | 70 + ...-use-constants-for-short-PSW-address.patch | 87 + ...s-Refactor-the-css_queue_crw-routine.patch | 119 + 
...ix-build-for-without-default-devices.patch | 74 + ...idate-iplb-validity-check-into-one-f.patch | 82 + ...-kvm-Make-kvm_sclp_service_call-void.patch | 83 + ...utine-to-get-the-vfio-dma-available-.patch | 150 + ...90x-pci-Honor-DMA-limits-set-by-vfio.patch | 357 + ...ove-header-files-to-include-hw-s390x.patch | 110 + ...s390x-protvirt-Add-migration-blocker.patch | 79 + ...isable-address-checks-for-PV-guest-I.patch | 135 + ...ix-stray-error_report_err-in-s390_ma.patch | 55 + ...t-Handle-SIGP-store-status-correctly.patch | 61 + ...nhibit-balloon-when-switching-to-pro.patch | 104 + ...s390x-protvirt-KVM-intercept-changes.patch | 75 + ...Move-IO-control-structures-over-SIDA.patch | 171 + ...x-protvirt-Move-STSI-data-over-SIDAD.patch | 70 + ...rotvirt-Move-diag-308-data-over-SIDA.patch | 93 + kvm-s390x-protvirt-SCLP-interpretation.patch | 172 + kvm-s390x-protvirt-Set-guest-IPL-PSW.patch | 75 + ...90x-protvirt-Support-unpack-facility.patch | 886 + ...llow-to-IPL-secure-guests-with-no-re.patch | 61 + ...M_PV_PREP_RESET-command-wrapper-name.patch | 92 + kvm-s390x-pv-Fix-diag318-PV-fencing.patch | 114 + ...s390x-pv-Remove-sclp-boundary-checks.patch | 57 + kvm-s390x-pv-Retry-ioctls-on-EINTR.patch | 57 + ...o-ccw-Fix-build-on-systems-without-K.patch | 150 + ...o-ccw-Reset-PCI-devices-during-subsy.patch | 52 + ...ove-unneeded-label-in-sclp_service_c.patch | 90 + ...90x-sigp-Fix-sense-running-reporting.patch | 49 + ...ear-local-interrupts-on-reset-normal.patch | 57 + kvm-s390x.conf | 19 + kvm-scsi-make-io_timeout-configurable.patch | 177 + ...ing-of-whole-process-instead-of-thre.patch | 79 + kvm-setup | 49 + kvm-setup.service | 14 + ...t_len-before-reading-protocol-header.patch | 72 + ...ct-size-while-emulating-IRC-commands.patch | 77 + ...orrect-size-while-emulating-commands.patch | 71 + ...u-memory-Log-invalid-memory-accesses.patch | 84 + ...-Adjust-firmware-path-of-PCI-devices.patch | 205 + ...llow-memory-unplug-to-always-succeed.patch | 101 + 
...ger-a-CAS-reboot-for-XICS-XIVE-mode-.patch | 113 + ...n-t-use-spapr_drc_needed-in-CAS-code.patch | 145 + ....3-accelerated-count-cache-flush-in-.patch | 135 + ...pability-issue-on-KVM-guest-for-PCI-.patch | 165 + ...ng-of-unplugged-devices-during-CAS-a.patch | 105 + ..._compose_response-into-h_client_arch.patch | 246 + ...-Improve-handling-of-fdt-buffer-size.patch | 125 + ...ndling-of-memory-unplug-with-old-gue.patch | 170 + ...aximum-number-of-vCPUs-to-the-KVM-in.patch | 213 + ...le-comment-about-power-saving-LPCR-b.patch | 50 + ...o-current-AIL-mode-when-starting-a-n.patch | 89 + ...-use-qemu_receive_packet-for-loopbac.patch | 54 + kvm-target-arm-Fix-PAuth-sbox-functions.patch | 65 + kvm-target-arm-arch_dump-Add-SVE-notes.patch | 298 + ...Add-the-kvm-no-adjvtime-CPU-property.patch | 281 + ...vm-Implement-virtual-time-adjustment.patch | 330 + ...rivial-Clean-up-header-documentation.patch | 197 + ...vm64-kvm64-cpus-have-timer-registers.patch | 60 + ...or-query-cpu-model-expansion-crashed.patch | 81 + ...ARCH_CAPABILITIES-related-bits-into-.patch | 83 + ...missed-features-to-Cooperlake-CPU-mo.patch | 103 + ...new-bit-definitions-of-MSR_IA32_ARCH.patch | 62 + ...target-i386-add-a-ucode-rev-property.patch | 125 + ...-i386-add-fast-short-REP-MOV-support.patch | 59 + ...k-for-availability-of-MSR_IA32_UCODE.patch | 72 + ...ot-set-unsupported-VMX-secondary-exe.patch | 112 + ...le-monitor-and-ucode-revision-with-c.patch | 49 + ...target-i386-fix-TCG-UCODE_REV-access.patch | 73 + ...m-initialize-feature-MSRs-very-early.patch | 178 + ...initialize-microcode-revision-from-K.patch | 64 + ...the-CPUID-level-to-0x14-on-old-machi.patch | 69 + ...fail-query-sev-capabilities-if-QEMU-.patch | 56 + ...provide-proper-error-reporting-for-q.patch | 142 + ...-Enable-adapter-interruption-suppres.patch | 60 + kvm-tcp_emu-Fix-oob-access.patch | 59 + kvm-tcp_emu-fix-unsafe-snprintf-usages.patch | 149 + ...eatures-Check-feature-default-values.patch | 106 + 
...es-test-add-test-cases-for-ACPI-HMAT.patch | 127 + ...tor-Fix-the-bad-s390x-assembler-code.patch | 60 + ...sts-numa-Add-case-for-QMP-build-HMAT.patch | 266 + kvm-tftp-check-tftp_input-buffer-size.patch | 53 + kvm-tftp-introduce-a-header-structure.patch | 263 + ...fuse_lowlevel-Fix-fuse_out_header-er.patch | 55 + ...ofsd-passthrough_ll-Fix-double-close.patch | 56 + kvm-tpm-ppi-page-align-PPI-RAM.patch | 58 + ...e-update-qemu-trace-stap-to-Python-3.patch | 82 + ...DT_V2-to-work-around-symbol-visibili.patch | 116 + ...-use-qemu_receive_packet_iov-for-loo.patch | 53 + kvm-udp-check-upd_input-buffer-size.patch | 52 + kvm-upd6-check-udp6_input-buffer-size.patch | 52 + ...sb-fix-setup_len-init-CVE-2020-14364.patch | 102 + ...-Prevent-recursion-in-usbredir_write.patch | 106 + kvm-util-Introduce-qemu_get_host_name.patch | 123 + kvm-util-add-slirp_fmt-helpers.patch | 140 + ...ed-routine-for-scanning-info-capabil.patch | 79 + kvm-vfio-Find-DMA-available-capability.patch | 91 + ...d-support-for-the-CRW-region-and-IRQ.patch | 175 + ...ccw-Add-support-for-the-schib-region.patch | 254 + ...-Connect-the-device-request-notifier.patch | 128 + kvm-vfio-ccw-Fix-error-message.patch | 48 + kvm-vfio-ccw-Refactor-ccw-irq-handler.patch | 155 + ...vfio-ccw-Refactor-cleanup-of-regions.patch | 73 + kvm-vfio-ccw-allow-non-prefetch-ORBs.patch | 61 + ...ve-exec-permission-to-avoid-SELinux-.patch | 75 + ...emove-irqchip-notifier-if-not-regist.patch | 58 + ...Add-names-to-section-rounded-warning.patch | 53 + ...t-Only-align-sections-for-vhost-user.patch | 97 + kvm-vhost-coding-style-fix.patch | 56 + ...ctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch | 69 + ...Print-unexpected-slave-message-types.patch | 48 + ...host-user-fs-remove-vhostfd-property.patch | 59 + ...st-user-gpu-Drop-trailing-json-comma.patch | 52 + ...ility-to-delete-vq-through-a-pointer.patch | 80 + kvm-virtio-add-vhost-user-fs-ccw-device.patch | 136 + ...start-process-queued-requests-in-the.patch | 203 + 
...tor-the-code-that-processes-queued-r.patch | 83 + ...-enable-notifications-during-polling.patch | 158 + ...io-fs-fix-MSI-X-nvectors-calculation.patch | 60 + ...-make-virtio_delete_queue-idempotent.patch | 42 + ...e-also-control-queue-when-TX-RX-dele.patch | 49 + ...o-net-fix-removal-of-failover-device.patch | 52 + ...-region-cache-when-on-queue-deletion.patch | 46 + kvm-virtiofs-Add-maintainers-entry.patch | 52 + ...-to-the-log-with-FUSE_LOG_DEBUG-leve.patch | 86 + ...akefile-wiring-for-virtiofsd-contrib.patch | 106 + kvm-virtiofsd-Add-auxiliary-.c-s.patch | 1387 ++ kvm-virtiofsd-Add-fuse_lowlevel.c.patch | 3172 ++++ kvm-virtiofsd-Add-main-virtio-loop.patch | 105 + kvm-virtiofsd-Add-options-for-virtio.patch | 103 + kvm-virtiofsd-Add-passthrough_ll.patch | 1387 ++ ...mestamp-to-the-log-with-FUSE_LOG_DEB.patch | 73 + ...virtiofsd-Clean-up-inodes-on-destroy.patch | 85 + ...t-lo_destroy-to-take-the-lo-mutex-lo.patch | 112 + ...isable-remote-posix-locks-by-default.patch | 72 + ...op-CAP_FSETID-if-client-asked-for-it.patch | 176 + ...embership-of-all-supplementary-group.patch | 111 + kvm-virtiofsd-Fast-path-for-virtio-read.patch | 240 + ...mmon-header-and-define-for-QEMU-buil.patch | 164 + ...ta-corruption-with-O_APPEND-write-in.patch | 136 + ...fuse_daemonize-ignored-return-values.patch | 120 + ...d-Fix-the-help-message-of-posix-lock.patch | 51 + kvm-virtiofsd-Fix-xattr-operations.patch | 327 + ...-Format-imported-files-to-qemu-style.patch | 14743 ++++++++++++++++ kvm-virtiofsd-Handle-hard-reboot.patch | 65 + kvm-virtiofsd-Handle-reinit.patch | 53 + kvm-virtiofsd-Keep-track-of-replies.patch | 116 + ...Kill-threads-when-queues-are-stopped.patch | 143 + ...sync-work-even-if-only-inode-is-pass.patch | 96 + ...vhost-connection-instead-of-mounting.patch | 257 + ...ofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch | 76 + ...Pass-write-iov-s-all-the-way-through.patch | 140 + ...-fuse_bufvec-through-to-do_write_buf.patch | 168 + kvm-virtiofsd-Poll-kick_fd-for-queue.patch | 97 + 
...t-multiply-running-with-same-vhost_u.patch | 144 + kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch | 945 + kvm-virtiofsd-Pull-in-upstream-headers.patch | 4911 +++++ kvm-virtiofsd-Remove-fuse_req_getgroups.patch | 193 + ...move-unused-enum-fuse_buf_copy_flags.patch | 271 + ...Reset-O_DIRECT-flag-during-file-open.patch | 72 + kvm-virtiofsd-Send-replies-to-messages.patch | 199 + kvm-virtiofsd-Show-submounts.patch | 51 + kvm-virtiofsd-Start-queue-threads.patch | 165 + ...sd-Start-reading-commands-from-queue.patch | 200 + ...virtiofsd-Start-wiring-up-vhost-user.patch | 247 + ...virtiofsd-Support-remote-posix-locks.patch | 355 + kvm-virtiofsd-Trim-down-imported-files.patch | 1582 ++ ...irtiofsd-Trim-out-compatibility-code.patch | 545 + kvm-virtiofsd-Whitelist-fchmod.patch | 79 + ...sd-add-definition-of-fuse_buf_writev.patch | 93 + ...iofsd-add-fd-FDNUM-fd-passing-option.patch | 170 + kvm-virtiofsd-add-fuse_mbuf_iter-API.patch | 134 + ...iofsd-add-helper-for-lo_data-cleanup.patch | 88 + ...fsd-add-o-source-PATH-to-help-output.patch | 46 + ...tiofsd-add-print-capabilities-option.patch | 121 + ...rtiofsd-add-rlimit-nofile-NUM-option.patch | 164 + kvm-virtiofsd-add-seccomp-whitelist.patch | 285 + ...add-some-options-to-the-help-message.patch | 74 + ...iofsd-add-syslog-command-line-option.patch | 239 + ...ofsd-add-thread-pool-size-NUM-option.patch | 106 + kvm-virtiofsd-add-vhost-user.json-file.patch | 73 + kvm-virtiofsd-cap-ng-helpers.patch | 175 + ...input-buffer-size-in-fuse_lowlevel.c.patch | 1111 ++ ...fsd-cleanup-allocated-resource-in-se.patch | 82 + ...t-more-fprintf-and-perror-to-use-fus.patch | 99 + ...d-do-not-always-set-FUSE_FLOCK_LOCKS.patch | 57 + ...virtiofsd-do_read-missing-NULL-check.patch | 49 + ...ll-capabilities-in-the-wait-parent-p.patch | 67 + ...d-enable-PARALLEL_DIROPS-during-INIT.patch | 47 + ...ofsd-extract-lo_do_open-from-lo_open.patch | 167 + ...ract-root-inode-init-into-setup_root.patch | 111 + ...hen-parent-inode-isn-t-known-in-lo_d.patch | 85 + 
...virtiofsd-fix-error-handling-in-main.patch | 63 + ...correct-error-handling-in-lo_do_look.patch | 44 + ...tiofsd-fix-libfuse-information-leaks.patch | 322 + ...tiofsd-fix-lo_destroy-resource-leaks.patch | 94 + ...rtiofsd-fix-memory-leak-on-lo.source.patch | 66 + ...ate_listen_socket-error-path-socket-.patch | 56 + ...virtiofsd-get-set-features-callbacks.patch | 66 + ...uce-inode-refcount-to-prevent-use-af.patch | 589 + kvm-virtiofsd-jail-lo-proc_self_fd.patch | 85 + kvm-virtiofsd-load_capng-missing-unlock.patch | 46 + ...tiofsd-make-f-foreground-the-default.patch | 76 + kvm-virtiofsd-make-lo_release-atomic.patch | 62 + ...irtiofsd-move-to-a-new-pid-namespace.patch | 223 + ...d-move-to-an-empty-network-namespace.patch | 66 + ...only-retain-file-system-capabilities.patch | 112 + ...ally-return-inode-pointer-from-lo_do.patch | 124 + ...rough_ll-Pass-errno-to-fuse_reply_er.patch | 54 + ...rough_ll-Use-cache_readdir-for-direc.patch | 48 + ...rough_ll-add-dirp_map-to-hide-lo_dir.patch | 238 + ...through_ll-add-fallback-for-racy-ops.patch | 303 + ...rough_ll-add-fd_map-to-hide-file-des.patch | 328 + ...rough_ll-add-ino_map-to-hide-lo_inod.patch | 395 + ...rough_ll-add-lo_map-for-ino-fh-indir.patch | 182 + ...passthrough_ll-add-renameat2-support.patch | 52 + ...rough_ll-clean-up-cache-related-opti.patch | 138 + ...hrough_ll-cleanup-getxattr-listxattr.patch | 154 + ...d-passthrough_ll-control-readdirplus.patch | 79 + ...rough_ll-create-new-files-in-caller-.patch | 198 + ...rough_ll-disable-readdirplus-on-cach.patch | 50 + ...rough_ll-fix-refcounting-on-remove-r.patch | 143 + ...rtiofsd-passthrough_ll-use-hashtable.patch | 211 + ...fsd-prevent-.-escape-in-lo_do_lookup.patch | 54 + ...sd-prevent-.-escape-in-lo_do_readdir.patch | 108 + ...prevent-FUSE_INIT-FUSE_DESTROY-races.patch | 103 + ...t-fv_queue_thread-vs-virtio_loop-rac.patch | 149 + ...t-opening-of-special-files-CVE-2020-.patch | 314 + ...iofsd-prevent-races-with-lo_dirp_put.patch | 147 + 
...log-only-when-priority-is-high-enoug.patch | 469 + ...sd-process-requests-in-a-thread-pool.patch | 533 + ...fsd-remove-mountpoint-dummy-argument.patch | 159 + ...d-remove-unused-notify-reply-support.patch | 294 + ...name-inode-refcount-to-inode-nlookup.patch | 139 + ...-unref_inode-to-unref_inode_lolocked.patch | 94 + kvm-virtiofsd-sandbox-mount-namespace.patch | 166 + ...ofsd-set-maximum-RLIMIT_NOFILE-limit.patch | 93 + ...elow-fs.file-max-sysctl-value-CVE-20.patch | 88 + ...ll-queue-threads-on-exit-in-virtio_l.patch | 72 + ...t-nanosecond-resolution-for-file-tim.patch | 83 + ...se_buf_writev-to-replace-fuse_buf_wr.patch | 82 + ...se_lowlevel_is_virtio-in-fuse_sessio.patch | 56 + ...-proc-self-fd-O_PATH-file-descriptor.patch | 390 + ...te-input-buffer-sizes-in-do_write_bu.patch | 137 + kvm-virtiofsd-validate-path-components.patch | 164 + ...-passthrough_ll-fix-fallocate-ifdefs.patch | 56 + ...able-AVX512_VP2INTERSECT-cpu-feature.patch | 63 + ...-cpu-Populate-SVM-CPUID-feature-bits.patch | 91 + kvm-x86.conf | 12 + ...ax_access_size-to-access-address-reg.patch | 76 + kvm-xhci-recheck-slot-status.patch | 77 + kvm-xics-Don-t-deassert-outputs.patch | 52 + kvm.conf | 3 + qemu-ga.sysconfig | 19 + qemu-guest-agent.service | 20 + qemu-kvm.spec | 4271 +++++ qemu-pr-helper.service | 15 + qemu-pr-helper.socket | 9 + udev-kvm-check.c | 155 + vhost.conf | 3 + 563 files changed, 108422 insertions(+), 11 deletions(-) create mode 100755 0005-Initial-redhat-build.patch create mode 100755 0006-Enable-disable-devices-for-RHEL.patch create mode 100755 0007-Machine-type-related-general-changes.patch create mode 100755 0008-Add-aarch64-machine-types.patch create mode 100755 0009-Add-ppc64-machine-types.patch create mode 100755 0010-Add-s390x-machine-types.patch create mode 100755 0011-Add-x86_64-machine-types.patch create mode 100755 0012-Enable-make-check.patch create mode 100755 0013-vfio-cap-number-of-devices-that-can-be-assigned.patch create mode 100755 
0014-Add-support-statement-to-help-output.patch create mode 100755 0015-globally-limit-the-maximum-number-of-CPUs.patch create mode 100755 0016-Add-support-for-simpletrace.patch create mode 100755 0017-Use-qemu-kvm-in-documentation-instead-of-qemu-system.patch create mode 100755 0018-usb-xhci-Fix-PCI-capability-order.patch create mode 100755 0019-virtio-scsi-Reject-scsi-cd-if-data-plane-enabled-RHE.patch create mode 100755 0020-BZ1653590-Require-at-least-64kiB-pages-for-downstrea.patch create mode 100755 0021-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch create mode 100755 81-kvm-rhel.rules create mode 100755 85-kvm.preset create mode 100755 95-kvm-memlock.conf create mode 100755 99-qemu-guest-agent.rules delete mode 100644 README.md create mode 100755 README.tests create mode 100755 bridge.conf create mode 100755 ksm.service create mode 100755 ksm.sysconfig create mode 100755 ksmctl.c create mode 100755 ksmtuned create mode 100755 ksmtuned.conf create mode 100755 ksmtuned.service create mode 100755 kvm-ACPI-add-expected-files-for-HMAT-tests-acpihmat.patch create mode 100755 kvm-Add-mtod_check.patch create mode 100755 kvm-Compress-lines-for-immediate-return.patch create mode 100755 kvm-Don-t-leak-memory-when-reallocation-fails.patch create mode 100755 kvm-Drop-bogus-IPv6-messages.patch create mode 100755 kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch create mode 100755 kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch create mode 100755 kvm-MAINTAINERS-fix-qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch create mode 100755 kvm-RHEL-hw-i386-disable-nested-PERF_GLOBAL_CTRL-MSR-sup.patch create mode 100755 kvm-Reallocate-dirty_bmap-when-we-change-a-slot.patch create mode 100755 kvm-Replace-remaining-malloc-free-user-with-glib.patch create mode 100755 kvm-Revert-RHEL-disable-hostmem-memfd.patch create mode 100755 kvm-Revert-mirror-Don-t-let-an-operation-wait-for-itself.patch create mode 100755 kvm-Virtiofsd-fix-memory-leak-on-fuse-queueinfo.patch create mode 
100755 kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch create mode 100755 kvm-aio-posix-completely-stop-polling-when-disabled.patch create mode 100755 kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch create mode 100755 kvm-apic-Use-32bit-APIC-ID-for-migration-instance-ID.patch create mode 100755 kvm-async-use-explicit-memory-barriers.patch create mode 100755 kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch create mode 100755 kvm-backup-Improve-error-for-bdrv_getlength-failure.patch create mode 100755 kvm-backup-Make-sure-that-source-and-target-size-match.patch create mode 100755 kvm-backup-don-t-acquire-aio_context-in-backup_clean.patch create mode 100755 kvm-backup-top-Begin-drain-earlier.patch create mode 100755 kvm-block-Activate-recursively-even-for-already-active-n.patch create mode 100755 kvm-block-Add-flags-to-BlockDriver.bdrv_co_truncate.patch create mode 100755 kvm-block-Add-flags-to-bdrv-_co-_truncate.patch create mode 100755 kvm-block-Call-attention-to-truncation-of-long-NBD-expor.patch create mode 100755 kvm-block-Fix-blk-in_flight-during-blk_wait_while_draine.patch create mode 100755 kvm-block-Fix-cross-AioContext-blockdev-snapshot.patch create mode 100755 kvm-block-Fix-leak-in-bdrv_create_file_fallback.patch create mode 100755 kvm-block-Generic-file-creation-fallback.patch create mode 100755 kvm-block-Increase-BB.in_flight-for-coroutine-and-sync-i.patch create mode 100755 kvm-block-Introduce-bdrv_reopen_commit_post-step.patch create mode 100755 kvm-block-Make-bdrv_get_cumulative_perm-public.patch create mode 100755 kvm-block-Make-it-easier-to-learn-which-BDS-support-bitm.patch create mode 100755 kvm-block-Relax-restrictions-for-blockdev-snapshot.patch create mode 100755 kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch create mode 100755 kvm-block-Versioned-x-blockdev-reopen-API-with-feature-f.patch create mode 100755 kvm-block-always-fill-entire-LUKS-header-space-with-zero.patch create mode 
100755 kvm-block-backend-Add-flags-to-blk_truncate.patch create mode 100755 kvm-block-backend-Reorder-flush-pdiscard-function-defini.patch create mode 100755 kvm-block-backup-top-Don-t-acquire-context-while-droppin.patch create mode 100755 kvm-block-bdrv_reopen-with-backing-file-in-different-Aio.patch create mode 100755 kvm-block-curl-HTTP-header-field-names-are-case-insensit.patch create mode 100755 kvm-block-curl-HTTP-header-fields-allow-whitespace-aroun.patch create mode 100755 kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch create mode 100755 kvm-block-introducing-bdrv_co_delete_file-interface.patch create mode 100755 kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch create mode 100755 kvm-block-nbd-Fix-hang-in-.bdrv_close.patch create mode 100755 kvm-block-pass-BlockDriver-reference-to-the-.bdrv_co_cre.patch create mode 100755 kvm-block-qcow2-Move-bitmap-reopen-into-bdrv_reopen_comm.patch create mode 100755 kvm-block-trickle-down-the-fallback-image-creation-funct.patch create mode 100755 kvm-block-truncate-Don-t-make-backing-file-data-visible.patch create mode 100755 kvm-block.c-adding-bdrv_co_delete_file.patch create mode 100755 kvm-blockdev-Acquire-AioContext-on-dirty-bitmap-function.patch create mode 100755 kvm-blockdev-Promote-several-bitmap-functions-to-non-sta.patch create mode 100755 kvm-blockdev-Return-bs-to-the-proper-context-on-snapshot.patch create mode 100755 kvm-blockdev-Split-off-basic-bitmap-operations-for-qemu-.patch create mode 100755 kvm-blockdev-fix-coding-style-issues-in-drive_backup_pre.patch create mode 100755 kvm-blockdev-honor-bdrv_try_set_aio_context-context-requ.patch create mode 100755 kvm-blockdev-unify-qmp_blockdev_backup-and-blockdev-back.patch create mode 100755 kvm-blockdev-unify-qmp_drive_backup-and-drive-backup-tra.patch create mode 100755 kvm-bootp-check-bootp_input-buffer-size.patch create mode 100755 kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch create mode 100755 
kvm-build-rename-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch create mode 100755 kvm-build-sys-do-not-make-qemu-ga-link-with-pixman.patch create mode 100755 kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch create mode 100755 kvm-compat-disable-edid-for-virtio-gpu-ccw.patch create mode 100755 kvm-config-enable-VFIO_CCW.patch create mode 100755 kvm-contrib-libvhost-user-Protect-slave-fd-with-mutex.patch create mode 100755 kvm-crypto.c-cleanup-created-file-when-block_crypto_co_c.patch create mode 100755 kvm-docs-arm-cpu-features-Make-kvm-no-adjvtime-comment-c.patch create mode 100755 kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch create mode 100755 kvm-e1000-fail-early-for-evil-descriptor.patch create mode 100755 kvm-e1000-fix-tx-re-entrancy-problem.patch create mode 100755 kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch create mode 100755 kvm-enable-ramfb.patch create mode 100755 kvm-error-Document-Error-API-usage-rules.patch create mode 100755 kvm-error-Fix-examples-in-error.h-s-big-comment.patch create mode 100755 kvm-error-Improve-error.h-s-big-comment.patch create mode 100755 kvm-error-New-macro-ERRP_GUARD.patch create mode 100755 kvm-exec-rom_reset-Free-rom-data-during-inmigrate-skip.patch create mode 100755 kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch create mode 100755 kvm-file-posix-Drop-hdev_co_create_opts.patch create mode 100755 kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch create mode 100755 kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch create mode 100755 kvm-file-posix-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch create mode 100755 kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch create mode 100755 kvm-hmat-acpi-Build-Memory-Proximity-Domain-Attributes-S.patch create mode 100755 kvm-hmat-acpi-Build-Memory-Side-Cache-Information-Struct.patch create mode 100755 kvm-hmat-acpi-Build-System-Locality-Latency-and-Bandwidt.patch create mode 100755 
kvm-hmp-Allow-using-qdev-ID-for-qemu-io-command.patch create mode 100755 kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch create mode 100755 kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch create mode 100755 kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch create mode 100755 kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch create mode 100755 kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch create mode 100755 kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch create mode 100755 kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch create mode 100755 kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch create mode 100755 kvm-hw-ehci-check-return-value-of-usb_packet_map.patch create mode 100755 kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch create mode 100755 kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch create mode 100755 kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch create mode 100755 kvm-hw-pci-pcie-Forbid-hot-plug-if-it-s-disabled-on-the-.patch create mode 100755 kvm-hw-pci-pcie-Move-hot-plug-capability-check-to-pre_pl.patch create mode 100755 kvm-hw-pci-pcie-Replace-PCI_DEVICE-casts-with-existing-v.patch create mode 100755 kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch create mode 100755 kvm-hw-smbios-set-new-default-SMBIOS-fields-for-Windows-.patch create mode 100755 kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch create mode 100755 kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch create mode 100755 kvm-i386-Add-macro-for-stibp.patch create mode 100755 kvm-i386-Add-new-CPU-model-Cooperlake.patch create mode 100755 kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch create mode 100755 kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch create mode 100755 kvm-i386-Remove-cpu64-rhel6-CPU-model.patch create mode 100755 kvm-i386-Resolve-CPU-models-to-v1-by-default.patch create mode 100755 
kvm-ide-atapi-check-logical-block-address-and-read-size-.patch create mode 100755 kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch create mode 100755 kvm-introduce-kvm_kernel_irqchip_-functions.patch create mode 100755 kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch create mode 100755 kvm-iotests-026-Test-EIO-on-allocation-in-a-data-file.patch create mode 100755 kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch create mode 100755 kvm-iotests-055-refactor-compressed-backup-to-vmdk.patch create mode 100755 kvm-iotests-055-skip-vmdk-target-tests-if-vmdk-is-not-wh.patch create mode 100755 kvm-iotests-109-Don-t-mirror-with-mismatched-size.patch create mode 100755 kvm-iotests-229-Use-blkdebug-to-inject-an-error.patch create mode 100755 kvm-iotests-Add-iothread-cases-to-155.patch create mode 100755 kvm-iotests-Add-more-skip_if_unsupported-statements-to-t.patch create mode 100755 kvm-iotests-Add-qemu_io_log.patch create mode 100755 kvm-iotests-Add-test-291-to-for-qemu-img-bitmap-coverage.patch create mode 100755 kvm-iotests-Add-test-for-image-creation-fallback.patch create mode 100755 kvm-iotests-Backup-with-different-source-target-size.patch create mode 100755 kvm-iotests-Create-VM.blockdev_create.patch create mode 100755 kvm-iotests-Filter-testfiles-out-in-filter_img_info.patch create mode 100755 kvm-iotests-Fix-run_job-with-use_log-False.patch create mode 100755 kvm-iotests-Fix-test-178.patch create mode 100755 kvm-iotests-Let-_make_test_img-parse-its-parameters.patch create mode 100755 kvm-iotests-Mirror-with-different-source-target-size.patch create mode 100755 kvm-iotests-Refactor-blockdev-reopen-test-for-iothreads.patch create mode 100755 kvm-iotests-Support-job-complete-in-run_job.patch create mode 100755 kvm-iotests-Test-committing-to-short-backing-file.patch create mode 100755 kvm-iotests-Test-external-snapshot-with-VM-state.patch create mode 100755 kvm-iotests-Test-handling-of-AioContexts-with-some-block.patch create mode 100755 
kvm-iotests-Test-mirror-with-temporarily-disabled-target.patch create mode 100755 kvm-iotests-Use-complete_and_wait-in-155.patch create mode 100755 kvm-iotests-don-t-use-format-for-drive_add.patch create mode 100755 kvm-iotests.py-Let-wait_migration-wait-even-more.patch create mode 100755 kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch create mode 100755 kvm-iscsi-Drop-iscsi_co_create_opts.patch create mode 100755 kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch create mode 100755 kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch create mode 100755 kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch create mode 100755 kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch create mode 100755 kvm-libvhost-user-Fix-some-memtable-remap-cases.patch create mode 100755 kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch create mode 100755 kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch create mode 100755 kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch create mode 100755 kvm-linux-headers-add-vfio-DMA-available-capability.patch create mode 100755 kvm-linux-headers-support-vfio-ccw-features.patch create mode 100755 kvm-linux-headers-update-kvm.h.patch create mode 100755 kvm-memory-Add-IOMMUTLBEvent.patch create mode 100755 kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch create mode 100755 kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch create mode 100755 kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch create mode 100755 kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch create mode 100755 kvm-memory-clamp-cached-translation-in-case-it-points-to.patch create mode 100755 kvm-migration-Change-SaveStateEntry.instance_id-into-uin.patch create mode 100755 kvm-migration-Create-migration_is_running.patch create mode 100755 kvm-migration-Define-VMSTATE_INSTANCE_ID_ANY.patch create mode 100755 kvm-migration-Don-t-send-data-if-we-have-stopped.patch 
create mode 100755 kvm-migration-Make-sure-that-we-don-t-call-write-in-case.patch create mode 100755 kvm-migration-Maybe-VM-is-paused-when-migration-is-cance.patch create mode 100755 kvm-migration-Rate-limit-inside-host-pages.patch create mode 100755 kvm-migration-multifd-clean-pages-after-filling-packet.patch create mode 100755 kvm-migration-multifd-fix-destroyed-mutex-access-in-term.patch create mode 100755 kvm-migration-multifd-fix-nullptr-access-in-multifd_send.patch create mode 100755 kvm-migration-multifd-fix-nullptr-access-in-terminating-.patch create mode 100755 kvm-mirror-Don-t-let-an-operation-wait-for-itself.patch create mode 100755 kvm-mirror-Make-sure-that-source-and-target-size-match.patch create mode 100755 kvm-mirror-Store-MirrorOp.co-for-debuggability.patch create mode 100755 kvm-mirror-Wait-only-for-in-flight-operations.patch create mode 100755 kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch create mode 100755 kvm-multifd-Make-sure-that-we-don-t-do-any-IO-after-an-e.patch create mode 100755 kvm-nbd-server-Avoid-long-error-message-assertions-CVE-2.patch create mode 100755 kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch create mode 100755 kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch create mode 100755 kvm-net-forbid-the-reentrant-RX.patch create mode 100755 kvm-net-introduce-qemu_receive_packet.patch create mode 100755 kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch create mode 100755 kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch create mode 100755 kvm-numa-Extend-CLI-to-provide-memory-latency-and-bandwi.patch create mode 100755 kvm-numa-Extend-CLI-to-provide-memory-side-cache-informa.patch create mode 100755 kvm-numa-properly-check-if-numa-is-supported.patch create mode 100755 kvm-numa-remove-not-needed-check.patch create mode 100755 kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch create mode 100755 kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch 
create mode 100755 kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch create mode 100755 kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch create mode 100755 kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch create mode 100755 kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch create mode 100755 kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch create mode 100755 kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch create mode 100755 kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch create mode 100755 kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch create mode 100755 kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch create mode 100755 kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch create mode 100755 kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch create mode 100755 kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch create mode 100755 kvm-pc-bios-s390x-Fix-reset-psw-mask.patch create mode 100755 kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch create mode 100755 kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch create mode 100755 kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch create mode 100755 kvm-pcie_root_port-Add-hotplug-disabling-option.patch create mode 100755 kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch create mode 100755 kvm-ppc-Deassert-the-external-interrupt-pin-in-KVM-on-re.patch create mode 100755 kvm-ppc-Don-t-use-CPUPPCState-irq_input_state-with-moder.patch create mode 100755 kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch create mode 100755 kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch create mode 100755 kvm-qapi-Add-allow-write-only-overlay-feature-for-blockd.patch create mode 100755 kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch create mode 100755 kvm-qcow2-Expose-bitmaps-size-during-measure.patch create mode 100755 
kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch create mode 100755 kvm-qcow2-Fix-qcow2_alloc_cluster_abort-for-external-dat.patch create mode 100755 kvm-qcow2-Forward-ZERO_WRITE-flag-for-full-preallocation.patch create mode 100755 kvm-qcow2-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch create mode 100755 kvm-qemu-file-Don-t-do-IO-after-shutdown.patch create mode 100755 kvm-qemu-img-Add-bitmap-sub-command.patch create mode 100755 kvm-qemu-img-Add-convert-bitmaps-option.patch create mode 100755 kvm-qemu-img-Factor-out-code-for-merging-bitmaps.patch create mode 100755 kvm-qemu-img-convert-Don-t-pre-zero-images.patch create mode 100755 kvm-qemu_img-add-cvtnum_full-to-print-error-reports.patch create mode 100755 kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch create mode 100755 kvm-qga-add-command-guest-get-disks.patch create mode 100755 kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch create mode 100755 kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch create mode 100755 kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch create mode 100755 kvm-qga-add-ssh-add-remove-authorized-keys.patch create mode 100755 kvm-qga-add-ssh-get-authorized-keys.patch create mode 100755 kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch create mode 100755 kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch create mode 100755 kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch create mode 100755 kvm-qga-fix-assert-regression-on-guest-shutdown.patch create mode 100755 kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch create mode 100755 kvm-qga-rename-Error-parameter-to-more-common-errp.patch create mode 100755 kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch create mode 100755 kvm-raw-format-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch create mode 100755 kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch create mode 100755 
kvm-replication-assert-we-own-context-before-job_cancel_.patch create mode 100755 kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch create mode 100755 kvm-s390-guest-support-for-diagnose-0x318.patch create mode 100755 kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch create mode 100755 kvm-s390-ipl-sync-back-loadparm.patch create mode 100755 kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch create mode 100755 kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch create mode 100755 kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch create mode 100755 kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch create mode 100755 kvm-s390-sclp-improve-special-wait-psw-logic.patch create mode 100755 kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch create mode 100755 kvm-s390-sclp-rework-sclp-boundary-checks.patch create mode 100755 kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch create mode 100755 kvm-s390x-Add-SIDA-memory-ops.patch create mode 100755 kvm-s390x-Add-missing-vcpu-reset-functions.patch create mode 100755 kvm-s390x-Add-unpack-facility-feature-to-GA1.patch create mode 100755 kvm-s390x-Beautify-diag308-handling.patch create mode 100755 kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch create mode 100755 kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch create mode 100755 kvm-s390x-Move-clear-reset.patch create mode 100755 kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch create mode 100755 kvm-s390x-Move-initial-reset.patch create mode 100755 kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch create mode 100755 kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch create mode 100755 kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch create mode 100755 kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch create mode 100755 kvm-s390x-fix-build-for-without-default-devices.patch create mode 100755 
kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch create mode 100755 kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch create mode 100755 kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch create mode 100755 kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch create mode 100755 kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch create mode 100755 kvm-s390x-protvirt-Add-migration-blocker.patch create mode 100755 kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch create mode 100755 kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch create mode 100755 kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch create mode 100755 kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch create mode 100755 kvm-s390x-protvirt-KVM-intercept-changes.patch create mode 100755 kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch create mode 100755 kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch create mode 100755 kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch create mode 100755 kvm-s390x-protvirt-SCLP-interpretation.patch create mode 100755 kvm-s390x-protvirt-Set-guest-IPL-PSW.patch create mode 100755 kvm-s390x-protvirt-Support-unpack-facility.patch create mode 100755 kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch create mode 100755 kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch create mode 100755 kvm-s390x-pv-Fix-diag318-PV-fencing.patch create mode 100755 kvm-s390x-pv-Remove-sclp-boundary-checks.patch create mode 100755 kvm-s390x-pv-Retry-ioctls-on-EINTR.patch create mode 100755 kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch create mode 100755 kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch create mode 100755 kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch create mode 100755 kvm-s390x-sigp-Fix-sense-running-reporting.patch create mode 100755 kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch create mode 100755 
kvm-s390x.conf create mode 100755 kvm-scsi-make-io_timeout-configurable.patch create mode 100755 kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch create mode 100755 kvm-setup create mode 100755 kvm-setup.service create mode 100755 kvm-slirp-check-pkt_len-before-reading-protocol-header.patch create mode 100755 kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch create mode 100755 kvm-slirp-use-correct-size-while-emulating-commands.patch create mode 100755 kvm-softmmu-memory-Log-invalid-memory-accesses.patch create mode 100755 kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch create mode 100755 kvm-spapr-Allow-memory-unplug-to-always-succeed.patch create mode 100755 kvm-spapr-Don-t-trigger-a-CAS-reboot-for-XICS-XIVE-mode-.patch create mode 100755 kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch create mode 100755 kvm-spapr-Enable-DD2.3-accelerated-count-cache-flush-in-.patch create mode 100755 kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch create mode 100755 kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch create mode 100755 kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch create mode 100755 kvm-spapr-Improve-handling-of-fdt-buffer-size.patch create mode 100755 kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch create mode 100755 kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch create mode 100755 kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch create mode 100755 kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch create mode 100755 kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch create mode 100755 kvm-target-arm-Fix-PAuth-sbox-functions.patch create mode 100755 kvm-target-arm-arch_dump-Add-SVE-notes.patch create mode 100755 kvm-target-arm-cpu-Add-the-kvm-no-adjvtime-CPU-property.patch create mode 100755 kvm-target-arm-kvm-Implement-virtual-time-adjustment.patch create mode 100755 
kvm-target-arm-kvm-trivial-Clean-up-header-documentation.patch create mode 100755 kvm-target-arm-kvm64-kvm64-cpus-have-timer-registers.patch create mode 100755 kvm-target-arm-monitor-query-cpu-model-expansion-crashed.patch create mode 100755 kvm-target-i386-Add-ARCH_CAPABILITIES-related-bits-into-.patch create mode 100755 kvm-target-i386-Add-missed-features-to-Cooperlake-CPU-mo.patch create mode 100755 kvm-target-i386-Add-new-bit-definitions-of-MSR_IA32_ARCH.patch create mode 100755 kvm-target-i386-add-a-ucode-rev-property.patch create mode 100755 kvm-target-i386-add-fast-short-REP-MOV-support.patch create mode 100755 kvm-target-i386-check-for-availability-of-MSR_IA32_UCODE.patch create mode 100755 kvm-target-i386-do-not-set-unsupported-VMX-secondary-exe.patch create mode 100755 kvm-target-i386-enable-monitor-and-ucode-revision-with-c.patch create mode 100755 kvm-target-i386-fix-TCG-UCODE_REV-access.patch create mode 100755 kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch create mode 100755 kvm-target-i386-kvm-initialize-microcode-revision-from-K.patch create mode 100755 kvm-target-i386-set-the-CPUID-level-to-0x14-on-old-machi.patch create mode 100755 kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch create mode 100755 kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch create mode 100755 kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch create mode 100755 kvm-tcp_emu-Fix-oob-access.patch create mode 100755 kvm-tcp_emu-fix-unsafe-snprintf-usages.patch create mode 100755 kvm-tests-arm-cpu-features-Check-feature-default-values.patch create mode 100755 kvm-tests-bios-tables-test-add-test-cases-for-ACPI-HMAT.patch create mode 100755 kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch create mode 100755 kvm-tests-numa-Add-case-for-QMP-build-HMAT.patch create mode 100755 kvm-tftp-check-tftp_input-buffer-size.patch create mode 100755 kvm-tftp-introduce-a-header-structure.patch create mode 100755 
kvm-tools-virtiofsd-fuse_lowlevel-Fix-fuse_out_header-er.patch create mode 100755 kvm-tools-virtiofsd-passthrough_ll-Fix-double-close.patch create mode 100755 kvm-tpm-ppi-page-align-PPI-RAM.patch create mode 100755 kvm-trace-update-qemu-trace-stap-to-Python-3.patch create mode 100755 kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch create mode 100755 kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch create mode 100755 kvm-udp-check-upd_input-buffer-size.patch create mode 100755 kvm-upd6-check-udp6_input-buffer-size.patch create mode 100755 kvm-usb-fix-setup_len-init-CVE-2020-14364.patch create mode 100755 kvm-usbredir-Prevent-recursion-in-usbredir_write.patch create mode 100755 kvm-util-Introduce-qemu_get_host_name.patch create mode 100755 kvm-util-add-slirp_fmt-helpers.patch create mode 100755 kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch create mode 100755 kvm-vfio-Find-DMA-available-capability.patch create mode 100755 kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch create mode 100755 kvm-vfio-ccw-Add-support-for-the-schib-region.patch create mode 100755 kvm-vfio-ccw-Connect-the-device-request-notifier.patch create mode 100755 kvm-vfio-ccw-Fix-error-message.patch create mode 100755 kvm-vfio-ccw-Refactor-ccw-irq-handler.patch create mode 100755 kvm-vfio-ccw-Refactor-cleanup-of-regions.patch create mode 100755 kvm-vfio-ccw-allow-non-prefetch-ORBs.patch create mode 100755 kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch create mode 100755 kvm-vfio-pci-Don-t-remove-irqchip-notifier-if-not-regist.patch create mode 100755 kvm-vhost-Add-names-to-section-rounded-warning.patch create mode 100755 kvm-vhost-Only-align-sections-for-vhost-user.patch create mode 100755 kvm-vhost-coding-style-fix.patch create mode 100755 kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch create mode 100755 kvm-vhost-user-Print-unexpected-slave-message-types.patch create mode 100755 
kvm-vhost-user-fs-remove-vhostfd-property.patch create mode 100755 kvm-vhost-user-gpu-Drop-trailing-json-comma.patch create mode 100755 kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch create mode 100755 kvm-virtio-add-vhost-user-fs-ccw-device.patch create mode 100755 kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch create mode 100755 kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch create mode 100755 kvm-virtio-don-t-enable-notifications-during-polling.patch create mode 100755 kvm-virtio-fs-fix-MSI-X-nvectors-calculation.patch create mode 100755 kvm-virtio-make-virtio_delete_queue-idempotent.patch create mode 100755 kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch create mode 100755 kvm-virtio-net-fix-removal-of-failover-device.patch create mode 100755 kvm-virtio-reset-region-cache-when-on-queue-deletion.patch create mode 100755 kvm-virtiofs-Add-maintainers-entry.patch create mode 100755 kvm-virtiofsd-Add-ID-to-the-log-with-FUSE_LOG_DEBUG-leve.patch create mode 100755 kvm-virtiofsd-Add-Makefile-wiring-for-virtiofsd-contrib.patch create mode 100755 kvm-virtiofsd-Add-auxiliary-.c-s.patch create mode 100755 kvm-virtiofsd-Add-fuse_lowlevel.c.patch create mode 100755 kvm-virtiofsd-Add-main-virtio-loop.patch create mode 100755 kvm-virtiofsd-Add-options-for-virtio.patch create mode 100755 kvm-virtiofsd-Add-passthrough_ll.patch create mode 100755 kvm-virtiofsd-Add-timestamp-to-the-log-with-FUSE_LOG_DEB.patch create mode 100755 kvm-virtiofsd-Clean-up-inodes-on-destroy.patch create mode 100755 kvm-virtiofsd-Convert-lo_destroy-to-take-the-lo-mutex-lo.patch create mode 100755 kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch create mode 100755 kvm-virtiofsd-Drop-CAP_FSETID-if-client-asked-for-it.patch create mode 100755 kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch create mode 100755 kvm-virtiofsd-Fast-path-for-virtio-read.patch create mode 100755 
kvm-virtiofsd-Fix-common-header-and-define-for-QEMU-buil.patch create mode 100755 kvm-virtiofsd-Fix-data-corruption-with-O_APPEND-write-in.patch create mode 100755 kvm-virtiofsd-Fix-fuse_daemonize-ignored-return-values.patch create mode 100755 kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch create mode 100755 kvm-virtiofsd-Fix-xattr-operations.patch create mode 100755 kvm-virtiofsd-Format-imported-files-to-qemu-style.patch create mode 100755 kvm-virtiofsd-Handle-hard-reboot.patch create mode 100755 kvm-virtiofsd-Handle-reinit.patch create mode 100755 kvm-virtiofsd-Keep-track-of-replies.patch create mode 100755 kvm-virtiofsd-Kill-threads-when-queues-are-stopped.patch create mode 100755 kvm-virtiofsd-Make-fsync-work-even-if-only-inode-is-pass.patch create mode 100755 kvm-virtiofsd-Open-vhost-connection-instead-of-mounting.patch create mode 100755 kvm-virtiofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch create mode 100755 kvm-virtiofsd-Pass-write-iov-s-all-the-way-through.patch create mode 100755 kvm-virtiofsd-Plumb-fuse_bufvec-through-to-do_write_buf.patch create mode 100755 kvm-virtiofsd-Poll-kick_fd-for-queue.patch create mode 100755 kvm-virtiofsd-Prevent-multiply-running-with-same-vhost_u.patch create mode 100755 kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch create mode 100755 kvm-virtiofsd-Pull-in-upstream-headers.patch create mode 100755 kvm-virtiofsd-Remove-fuse_req_getgroups.patch create mode 100755 kvm-virtiofsd-Remove-unused-enum-fuse_buf_copy_flags.patch create mode 100755 kvm-virtiofsd-Reset-O_DIRECT-flag-during-file-open.patch create mode 100755 kvm-virtiofsd-Send-replies-to-messages.patch create mode 100755 kvm-virtiofsd-Show-submounts.patch create mode 100755 kvm-virtiofsd-Start-queue-threads.patch create mode 100755 kvm-virtiofsd-Start-reading-commands-from-queue.patch create mode 100755 kvm-virtiofsd-Start-wiring-up-vhost-user.patch create mode 100755 kvm-virtiofsd-Support-remote-posix-locks.patch create mode 100755 
kvm-virtiofsd-Trim-down-imported-files.patch create mode 100755 kvm-virtiofsd-Trim-out-compatibility-code.patch create mode 100755 kvm-virtiofsd-Whitelist-fchmod.patch create mode 100755 kvm-virtiofsd-add-definition-of-fuse_buf_writev.patch create mode 100755 kvm-virtiofsd-add-fd-FDNUM-fd-passing-option.patch create mode 100755 kvm-virtiofsd-add-fuse_mbuf_iter-API.patch create mode 100755 kvm-virtiofsd-add-helper-for-lo_data-cleanup.patch create mode 100755 kvm-virtiofsd-add-o-source-PATH-to-help-output.patch create mode 100755 kvm-virtiofsd-add-print-capabilities-option.patch create mode 100755 kvm-virtiofsd-add-rlimit-nofile-NUM-option.patch create mode 100755 kvm-virtiofsd-add-seccomp-whitelist.patch create mode 100755 kvm-virtiofsd-add-some-options-to-the-help-message.patch create mode 100755 kvm-virtiofsd-add-syslog-command-line-option.patch create mode 100755 kvm-virtiofsd-add-thread-pool-size-NUM-option.patch create mode 100755 kvm-virtiofsd-add-vhost-user.json-file.patch create mode 100755 kvm-virtiofsd-cap-ng-helpers.patch create mode 100755 kvm-virtiofsd-check-input-buffer-size-in-fuse_lowlevel.c.patch create mode 100755 kvm-virtiofsd-cleanup-allocated-resource-in-se.patch create mode 100755 kvm-virtiofsd-convert-more-fprintf-and-perror-to-use-fus.patch create mode 100755 kvm-virtiofsd-do-not-always-set-FUSE_FLOCK_LOCKS.patch create mode 100755 kvm-virtiofsd-do_read-missing-NULL-check.patch create mode 100755 kvm-virtiofsd-drop-all-capabilities-in-the-wait-parent-p.patch create mode 100755 kvm-virtiofsd-enable-PARALLEL_DIROPS-during-INIT.patch create mode 100755 kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch create mode 100755 kvm-virtiofsd-extract-root-inode-init-into-setup_root.patch create mode 100755 kvm-virtiofsd-fail-when-parent-inode-isn-t-known-in-lo_d.patch create mode 100755 kvm-virtiofsd-fix-error-handling-in-main.patch create mode 100755 kvm-virtiofsd-fix-incorrect-error-handling-in-lo_do_look.patch create mode 100755 
kvm-virtiofsd-fix-libfuse-information-leaks.patch create mode 100755 kvm-virtiofsd-fix-lo_destroy-resource-leaks.patch create mode 100755 kvm-virtiofsd-fix-memory-leak-on-lo.source.patch create mode 100755 kvm-virtiofsd-fv_create_listen_socket-error-path-socket-.patch create mode 100755 kvm-virtiofsd-get-set-features-callbacks.patch create mode 100755 kvm-virtiofsd-introduce-inode-refcount-to-prevent-use-af.patch create mode 100755 kvm-virtiofsd-jail-lo-proc_self_fd.patch create mode 100755 kvm-virtiofsd-load_capng-missing-unlock.patch create mode 100755 kvm-virtiofsd-make-f-foreground-the-default.patch create mode 100755 kvm-virtiofsd-make-lo_release-atomic.patch create mode 100755 kvm-virtiofsd-move-to-a-new-pid-namespace.patch create mode 100755 kvm-virtiofsd-move-to-an-empty-network-namespace.patch create mode 100755 kvm-virtiofsd-only-retain-file-system-capabilities.patch create mode 100755 kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch create mode 100755 kvm-virtiofsd-passthrough_ll-Pass-errno-to-fuse_reply_er.patch create mode 100755 kvm-virtiofsd-passthrough_ll-Use-cache_readdir-for-direc.patch create mode 100755 kvm-virtiofsd-passthrough_ll-add-dirp_map-to-hide-lo_dir.patch create mode 100755 kvm-virtiofsd-passthrough_ll-add-fallback-for-racy-ops.patch create mode 100755 kvm-virtiofsd-passthrough_ll-add-fd_map-to-hide-file-des.patch create mode 100755 kvm-virtiofsd-passthrough_ll-add-ino_map-to-hide-lo_inod.patch create mode 100755 kvm-virtiofsd-passthrough_ll-add-lo_map-for-ino-fh-indir.patch create mode 100755 kvm-virtiofsd-passthrough_ll-add-renameat2-support.patch create mode 100755 kvm-virtiofsd-passthrough_ll-clean-up-cache-related-opti.patch create mode 100755 kvm-virtiofsd-passthrough_ll-cleanup-getxattr-listxattr.patch create mode 100755 kvm-virtiofsd-passthrough_ll-control-readdirplus.patch create mode 100755 kvm-virtiofsd-passthrough_ll-create-new-files-in-caller-.patch create mode 100755 
kvm-virtiofsd-passthrough_ll-disable-readdirplus-on-cach.patch create mode 100755 kvm-virtiofsd-passthrough_ll-fix-refcounting-on-remove-r.patch create mode 100755 kvm-virtiofsd-passthrough_ll-use-hashtable.patch create mode 100755 kvm-virtiofsd-prevent-.-escape-in-lo_do_lookup.patch create mode 100755 kvm-virtiofsd-prevent-.-escape-in-lo_do_readdir.patch create mode 100755 kvm-virtiofsd-prevent-FUSE_INIT-FUSE_DESTROY-races.patch create mode 100755 kvm-virtiofsd-prevent-fv_queue_thread-vs-virtio_loop-rac.patch create mode 100755 kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch create mode 100755 kvm-virtiofsd-prevent-races-with-lo_dirp_put.patch create mode 100755 kvm-virtiofsd-print-log-only-when-priority-is-high-enoug.patch create mode 100755 kvm-virtiofsd-process-requests-in-a-thread-pool.patch create mode 100755 kvm-virtiofsd-remove-mountpoint-dummy-argument.patch create mode 100755 kvm-virtiofsd-remove-unused-notify-reply-support.patch create mode 100755 kvm-virtiofsd-rename-inode-refcount-to-inode-nlookup.patch create mode 100755 kvm-virtiofsd-rename-unref_inode-to-unref_inode_lolocked.patch create mode 100755 kvm-virtiofsd-sandbox-mount-namespace.patch create mode 100755 kvm-virtiofsd-set-maximum-RLIMIT_NOFILE-limit.patch create mode 100755 kvm-virtiofsd-stay-below-fs.file-max-sysctl-value-CVE-20.patch create mode 100755 kvm-virtiofsd-stop-all-queue-threads-on-exit-in-virtio_l.patch create mode 100755 kvm-virtiofsd-support-nanosecond-resolution-for-file-tim.patch create mode 100755 kvm-virtiofsd-use-fuse_buf_writev-to-replace-fuse_buf_wr.patch create mode 100755 kvm-virtiofsd-use-fuse_lowlevel_is_virtio-in-fuse_sessio.patch create mode 100755 kvm-virtiofsd-use-proc-self-fd-O_PATH-file-descriptor.patch create mode 100755 kvm-virtiofsd-validate-input-buffer-sizes-in-do_write_bu.patch create mode 100755 kvm-virtiofsd-validate-path-components.patch create mode 100755 kvm-vitriofsd-passthrough_ll-fix-fallocate-ifdefs.patch create mode 100755 
kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch create mode 100755 kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch create mode 100755 kvm-x86.conf create mode 100755 kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch create mode 100755 kvm-xhci-recheck-slot-status.patch create mode 100755 kvm-xics-Don-t-deassert-outputs.patch create mode 100755 kvm.conf create mode 100755 qemu-ga.sysconfig create mode 100755 qemu-guest-agent.service create mode 100755 qemu-kvm.spec create mode 100755 qemu-pr-helper.service create mode 100755 qemu-pr-helper.socket create mode 100755 udev-kvm-check.c create mode 100755 vhost.conf diff --git a/0005-Initial-redhat-build.patch b/0005-Initial-redhat-build.patch new file mode 100755 index 0000000..cde66a1 --- /dev/null +++ b/0005-Initial-redhat-build.patch @@ -0,0 +1,167 @@ +From 4df157781801c50224373be57fa3c8c3741c0535 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 12 Oct 2018 07:31:11 +0200 +Subject: Initial redhat build + +This patch introduces redhat build structure in redhat subdirectory. 
In addition, +several issues are fixed in QEMU tree: + + - Change of app name for sasl_server_init in VNC code from qemu to qemu-kvm + - As we use qemu-kvm as name in all places, this is updated to be consistent + - Man page renamed from qemu to qemu-kvm + - man page is installed using make install so we have to fix it in qemu tree + - Use "/share/qemu-kvm" as SHARE_SUFFIX + - We reconfigured our share to qemu-kvm to be consistent with used name + +This rebase includes changes up to qemu-kvm-4.1.0-18.el8 + +Rebase notes (3.1.0): +- added new configure options + +Rebase notes (4.0.0): +- Added dependency to perl-Test-Harness (upstream) +- Added dependency to python3-sphinx (upstream) +- Change location of icons (upstream) +- Remove .desktop file (added upstream) +- Added qemu-trace-stap (added upstream) +- Removed elf2dmp (added upstream) +- Remove .buildinfo +- Added pvh.bin rom (added upstream) +- Added interop documentation files +- Use python module instead of qemu.py (upstream) + +Rebase notes (4.1.0): +- Remove edk2 files generated by build +- Switch to rhel-8.1-candidate build target +- Remove specs documentation +- Switched from libssh2 to libssh +- Add rc0 tarball usage hacks +- Added BuildRequires for wget, rpm-build and python3-sphinx +- Removed new unpacked files +- Update configure line to use new options + +Rebase notes (4.2.0): +- Disable iotest run during make check +- README renamed to README.rst (upstream) +- Removed ui-spice-app.so +- Added relevant changes from "505f7f4 redhat: Adding slirp to the exploded tree" +- Removed qemu-ga.8 install from spec file - installed by make +- Removed spapr-rtas.bin (upstream) +- Require newer SLOF (20191022) + +Merged patches (3.1.0): +- 01f0c9f RHEL8: Add disable configure options to qemu spec file +- Spec file cleanups + +Merged patches (4.0.0): +- aa4297c Add edk2 Requires to qemu-kvm +- d124ff5779 Fixing brew build target +- eb204b5 Introduce the qemu-kvm-tests rpm +- 223cf0c Load kvm module during boot 
(partial) + +Merged patches (4.1.0): +- ebb6e97 redhat: Fix LOCALVERSION creation +- b0ab0cc redhat: enable tpmdev passthrough (not disabling tests) +- 7cb3c4a Enable libpmem to support nvdimm +- 8943607 qemu-kvm.spec: bump libseccomp >= 2.4.0 +- 27b7c44 rh: set CONFIG_BOCHS_DISPLAY=y for x86 (partial) +- e1fe9fe x86_64-rh-devices: enable TPM emulation (partial) + +Merged patches (4.2.0): +- 69e1fb2 enable virgla +- d4f6115 enable virgl, for real this time ... + +Signed-off-by: Danilo C. L. de Paula +--- + .gitignore | 1 + + Makefile | 3 +- + configure | 1 + + os-posix.c | 2 +- + redhat/Makefile | 82 + + redhat/Makefile.common | 51 + + redhat/README.tests | 39 + + redhat/qemu-kvm.spec.template | 2434 +++++++++++++++++++++++++++++ + redhat/scripts/process-patches.sh | 7 +- + tests/Makefile.include | 2 +- + ui/vnc.c | 2 +- + 11 files changed, 2615 insertions(+), 9 deletions(-) + create mode 100644 redhat/Makefile + create mode 100644 redhat/Makefile.common + create mode 100644 redhat/README.tests + create mode 100644 redhat/qemu-kvm.spec.template + +diff --git a/Makefile b/Makefile +index b437a346d7..086727dbb9 100644 +--- a/Makefile ++++ b/Makefile +@@ -512,6 +512,7 @@ CAP_CFLAGS += -DCAPSTONE_HAS_ARM + CAP_CFLAGS += -DCAPSTONE_HAS_ARM64 + CAP_CFLAGS += -DCAPSTONE_HAS_POWERPC + CAP_CFLAGS += -DCAPSTONE_HAS_X86 ++CAP_CFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS + + .PHONY: capstone/all + capstone/all: .git-submodule-status +@@ -826,7 +827,7 @@ install-doc: $(DOCS) install-sphinxdocs + $(INSTALL_DATA) docs/interop/qemu-qmp-ref.txt "$(DESTDIR)$(qemu_docdir)" + ifdef CONFIG_POSIX + $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1" +- $(INSTALL_DATA) qemu.1 "$(DESTDIR)$(mandir)/man1" ++ $(INSTALL_DATA) qemu.1 "$(DESTDIR)$(mandir)/man1/qemu-kvm.1" + $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man7" + $(INSTALL_DATA) docs/interop/qemu-qmp-ref.7 "$(DESTDIR)$(mandir)/man7" + $(INSTALL_DATA) docs/qemu-block-drivers.7 "$(DESTDIR)$(mandir)/man7" +diff --git a/configure b/configure +index 
6099be1d84..16564f8ccc 100755 +--- a/configure ++++ b/configure +@@ -2424,6 +2424,7 @@ if test "$seccomp" != "no" ; then + seccomp="no" + fi + fi ++ + ########################################## + # xen probe + +diff --git a/os-posix.c b/os-posix.c +index 86cffd2c7d..1c9f86768d 100644 +--- a/os-posix.c ++++ b/os-posix.c +@@ -83,7 +83,7 @@ void os_setup_signal_handling(void) + /* Find a likely location for support files using the location of the binary. + For installed binaries this will be "$bindir/../share/qemu". When + running from the build tree this will be "$bindir/../pc-bios". */ +-#define SHARE_SUFFIX "/share/qemu" ++#define SHARE_SUFFIX "/share/qemu-kvm" + #define BUILD_SUFFIX "/pc-bios" + char *os_find_datadir(void) + { +diff --git a/tests/Makefile.include b/tests/Makefile.include +index 8566f5f119..b483790cf3 100644 +--- a/tests/Makefile.include ++++ b/tests/Makefile.include +@@ -1194,7 +1194,7 @@ check-acceptance: check-venv $(TESTS_RESULTS_DIR) + check-qapi-schema: check-tests/qapi-schema/frontend check-tests/qapi-schema/doc-good.texi + check-qtest: $(patsubst %,check-qtest-%, $(QTEST_TARGETS)) + check-block: $(patsubst %,check-%, $(check-block-y)) +-check: check-block check-qapi-schema check-unit check-softfloat check-qtest check-decodetree ++check: check-qapi-schema check-unit check-softfloat check-qtest check-decodetree + check-clean: + rm -rf $(check-unit-y) tests/*.o $(QEMU_IOTESTS_HELPERS-y) + rm -rf $(sort $(foreach target,$(SYSEMU_TARGET_LIST), $(check-qtest-$(target)-y)) $(check-qtest-generic-y)) +diff --git a/ui/vnc.c b/ui/vnc.c +index 87b8045afe..ecf6276f5b 100644 +--- a/ui/vnc.c ++++ b/ui/vnc.c +@@ -3987,7 +3987,7 @@ void vnc_display_open(const char *id, Error **errp) + + #ifdef CONFIG_VNC_SASL + if (sasl) { +- int saslErr = sasl_server_init(NULL, "qemu"); ++ int saslErr = sasl_server_init(NULL, "qemu-kvm"); + + if (saslErr != SASL_OK) { + error_setg(errp, "Failed to initialize SASL auth: %s", +-- +2.21.0 + diff --git 
a/0006-Enable-disable-devices-for-RHEL.patch b/0006-Enable-disable-devices-for-RHEL.patch new file mode 100755 index 0000000..b14bb1b --- /dev/null +++ b/0006-Enable-disable-devices-for-RHEL.patch @@ -0,0 +1,994 @@ +From 67511676246cce57becbd2dcf5abccf08d9ef737 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Mon, 11 Jan 2016 11:53:33 +0100 +Subject: Enable/disable devices for RHEL + +This commit adds all changes related to changes in supported devices. + +Signed-off-by: Miroslav Rezanina + +Rebase notes (qemu 3.1.0) +- spapr_rng disabled in default_config +- new hyperv.mak in default configs +- Move changes from x86_64-softmmu.mak to i386-softmmu.mak +- Added CONFIG_VIRTIO_MMIO to aarch64-softmmu.mak +- Removed config_vga_isa.c changes as no longer needed +- Removed new devices + +Rebase notes (4.0.0): +- Added CONFIG_PCI_EXPRESS_GENERIC_BRIDGE for aarch64-softmmu.mak +- Added CONFIG_ARM_VIRT for aarch64-softmmu.mak +- Switch to KConfig (upstream) + - Using device whitelist + without-default-devices option + +Rebase notes (4.1.0): +- Added CONFIG_USB_OHCI_PCI for ppc64 +- Added CONFIG_XIVE_KVM for ppc64 +- Added CONFIG_ACPI_PCI for x86_64 +- Added CONFIG_SEMIHOSTING for aarch64 +- Cleanup aarch64 devices +- Do not build a15mpcore.c +- Removed ide-isa.c stub file +- Use CONFIG_USB_EHCI_PCI on x86_64 (new upstream) + +Rebase notes (4.2.0-rc0): +- Use conditional build for isa-superio.c (upstream change) +- Rename PCI_PIIX to PCI_I440FX (upstream change) + +Rebase notes (4.2.0-rc3): +- Disabled ccid-card-emulated (patch 92566) +- Disabled vfio-pci-igd-lpc-bridge (patch 92565) + +Merged patches (qemu 3.1.0): +- d51e082 Re-enable CONFIG_HYPERV_TESTDEV +- 4b889f3 Declare cirrus-vga as deprecated +- b579d32 Do not build bluetooth support +- 3eef52a Disable CONFIG_IPMI and CONFIG_I2C for ppc64 +- 9caf292 Disable CONFIG_CAN_BUS and CONFIG_CAN_SJA1000 + +Merged patches (4.1.0): +- 20a51f6 fdc: Revert downstream disablement of device "floppy" +- f869cc0 fdc: Restrict
floppy controllers to RHEL-7 machine types +- 5909721 aarch64: Compile out IOH3420 +- 27b7c44 rh: set CONFIG_BOCHS_DISPLAY=y for x86 (partial) +- 495a27d x86_64-rh-devices: add missing TPM passthrough +- e1fe9fe x86_64-rh-devices: enable TPM emulation (partial) + +Merged patches (4.2.0): +- f7587dd RHEL: disable hostmem-memfd + +Signed-off-by: Danilo C. L. de Paula +--- + Makefile.objs | 4 +- + backends/Makefile.objs | 3 +- + default-configs/aarch64-rh-devices.mak | 20 +++++ + default-configs/aarch64-softmmu.mak | 10 ++- + default-configs/ppc64-rh-devices.mak | 32 ++++++++ + default-configs/ppc64-softmmu.mak | 8 +- + default-configs/rh-virtio.mak | 10 +++ + default-configs/s390x-rh-devices.mak | 15 ++++ + default-configs/s390x-softmmu.mak | 4 +- + default-configs/x86_64-rh-devices.mak | 100 +++++++++++++++++++++++++ + default-configs/x86_64-softmmu.mak | 4 +- + hw/acpi/ich9.c | 4 +- + hw/arm/Makefile.objs | 2 +- + hw/block/fdc.c | 10 +++ + hw/bt/Makefile.objs | 4 +- + hw/cpu/Makefile.objs | 5 +- + hw/display/Makefile.objs | 5 +- + hw/display/cirrus_vga.c | 3 + + hw/ide/piix.c | 5 +- + hw/input/pckbd.c | 2 + + hw/net/e1000.c | 2 + + hw/pci-host/i440fx.c | 4 + + hw/ppc/spapr_cpu_core.c | 2 + + hw/usb/Makefile.objs | 4 +- + hw/vfio/pci-quirks.c | 9 +++ + hw/vfio/pci.c | 5 ++ + qemu-options.hx | 7 +- + redhat/qemu-kvm.spec.template | 5 +- + target/arm/cpu.c | 4 +- + target/i386/cpu.c | 35 +++++++-- + target/ppc/cpu-models.c | 10 +++ + target/s390x/cpu_models.c | 3 + + target/s390x/kvm.c | 8 ++ + util/memfd.c | 2 +- + vl.c | 8 +- + 35 files changed, 317 insertions(+), 41 deletions(-) + create mode 100644 default-configs/aarch64-rh-devices.mak + create mode 100644 default-configs/ppc64-rh-devices.mak + create mode 100644 default-configs/rh-virtio.mak + create mode 100644 default-configs/s390x-rh-devices.mak + create mode 100644 default-configs/x86_64-rh-devices.mak + +diff --git a/Makefile.objs b/Makefile.objs +index 11ba1a36bd..fcf63e1096 100644 +--- a/Makefile.objs 
++++ b/Makefile.objs +@@ -65,8 +65,8 @@ common-obj-y += replay/ + + common-obj-y += ui/ + common-obj-m += ui/ +-common-obj-y += bt-host.o bt-vhci.o +-bt-host.o-cflags := $(BLUEZ_CFLAGS) ++#common-obj-y += bt-host.o bt-vhci.o ++#bt-host.o-cflags := $(BLUEZ_CFLAGS) + + common-obj-y += dma-helpers.o + common-obj-y += vl.o +diff --git a/backends/Makefile.objs b/backends/Makefile.objs +index f0691116e8..f328d404bf 100644 +--- a/backends/Makefile.objs ++++ b/backends/Makefile.objs +@@ -16,4 +16,5 @@ endif + + common-obj-$(call land,$(CONFIG_VHOST_USER),$(CONFIG_VIRTIO)) += vhost-user.o + +-common-obj-$(CONFIG_LINUX) += hostmem-memfd.o ++# RHEL: disable memfd ++# common-obj-$(CONFIG_LINUX) += hostmem-memfd.o +diff --git a/default-configs/aarch64-rh-devices.mak b/default-configs/aarch64-rh-devices.mak +new file mode 100644 +index 0000000000..a1ed641174 +--- /dev/null ++++ b/default-configs/aarch64-rh-devices.mak +@@ -0,0 +1,20 @@ ++include rh-virtio.mak ++ ++CONFIG_ARM_GIC_KVM=y ++CONFIG_ARM_SMMUV3=y ++CONFIG_ARM_V7M=y ++CONFIG_ARM_VIRT=y ++CONFIG_EDID=y ++CONFIG_PCIE_PORT=y ++CONFIG_PCI_DEVICES=y ++CONFIG_PCI_TESTDEV=y ++CONFIG_PFLASH_CFI01=y ++CONFIG_SCSI=y ++CONFIG_SEMIHOSTING=y ++CONFIG_USB=y ++CONFIG_USB_XHCI=y ++CONFIG_VFIO=y ++CONFIG_VFIO_PCI=y ++CONFIG_VIRTIO_MMIO=y ++CONFIG_VIRTIO_PCI=y ++CONFIG_XIO3130=y +diff --git a/default-configs/aarch64-softmmu.mak b/default-configs/aarch64-softmmu.mak +index 958b1e08e4..8f6867d48a 100644 +--- a/default-configs/aarch64-softmmu.mak ++++ b/default-configs/aarch64-softmmu.mak +@@ -1,8 +1,10 @@ + # Default configuration for aarch64-softmmu + + # We support all the 32 bit boards so need all their config +-include arm-softmmu.mak ++#include arm-softmmu.mak + +-CONFIG_XLNX_ZYNQMP_ARM=y +-CONFIG_XLNX_VERSAL=y +-CONFIG_SBSA_REF=y ++#CONFIG_XLNX_ZYNQMP_ARM=y ++#CONFIG_XLNX_VERSAL=y ++#CONFIG_SBSA_REF=y ++ ++include aarch64-rh-devices.mak +diff --git a/default-configs/ppc64-rh-devices.mak b/default-configs/ppc64-rh-devices.mak +new 
file mode 100644 +index 0000000000..35f2106d06 +--- /dev/null ++++ b/default-configs/ppc64-rh-devices.mak +@@ -0,0 +1,32 @@ ++include rh-virtio.mak ++ ++CONFIG_DIMM=y ++CONFIG_MEM_DEVICE=y ++CONFIG_PCI=y ++CONFIG_PCI_DEVICES=y ++CONFIG_PCI_TESTDEV=y ++CONFIG_PSERIES=y ++CONFIG_SCSI=y ++CONFIG_SPAPR_VSCSI=y ++CONFIG_TEST_DEVICES=y ++CONFIG_USB=y ++CONFIG_USB_OHCI=y ++CONFIG_USB_OHCI_PCI=y ++CONFIG_USB_SMARTCARD=y ++CONFIG_USB_STORAGE_BOT=y ++CONFIG_USB_XHCI=y ++CONFIG_USB_XHCI_NEC=y ++CONFIG_VFIO=y ++CONFIG_VFIO_PCI=y ++CONFIG_VGA=y ++CONFIG_VGA_PCI=y ++CONFIG_VHOST_USER=y ++CONFIG_VIRTIO_PCI=y ++CONFIG_VIRTIO_VGA=y ++CONFIG_WDT_IB6300ESB=y ++CONFIG_XICS=y ++CONFIG_XICS_KVM=y ++CONFIG_XICS_SPAPR=y ++CONFIG_XIVE=y ++CONFIG_XIVE_SPAPR=y ++CONFIG_XIVE_KVM=y +diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak +index cca52665d9..fec354f327 100644 +--- a/default-configs/ppc64-softmmu.mak ++++ b/default-configs/ppc64-softmmu.mak +@@ -1,10 +1,12 @@ + # Default configuration for ppc64-softmmu + + # Include all 32-bit boards +-include ppc-softmmu.mak ++#include ppc-softmmu.mak + + # For PowerNV +-CONFIG_POWERNV=y ++#CONFIG_POWERNV=y + + # For pSeries +-CONFIG_PSERIES=y ++#CONFIG_PSERIES=y ++ ++include ppc64-rh-devices.mak +diff --git a/default-configs/rh-virtio.mak b/default-configs/rh-virtio.mak +new file mode 100644 +index 0000000000..94ede1b5f6 +--- /dev/null ++++ b/default-configs/rh-virtio.mak +@@ -0,0 +1,10 @@ ++CONFIG_VIRTIO=y ++CONFIG_VIRTIO_BALLOON=y ++CONFIG_VIRTIO_BLK=y ++CONFIG_VIRTIO_GPU=y ++CONFIG_VIRTIO_INPUT=y ++CONFIG_VIRTIO_INPUT_HOST=y ++CONFIG_VIRTIO_NET=y ++CONFIG_VIRTIO_RNG=y ++CONFIG_VIRTIO_SCSI=y ++CONFIG_VIRTIO_SERIAL=y +diff --git a/default-configs/s390x-rh-devices.mak b/default-configs/s390x-rh-devices.mak +new file mode 100644 +index 0000000000..c3c73fe752 +--- /dev/null ++++ b/default-configs/s390x-rh-devices.mak +@@ -0,0 +1,15 @@ ++include rh-virtio.mak ++ ++CONFIG_PCI=y ++CONFIG_S390_CCW_VIRTIO=y 
++CONFIG_S390_FLIC=y ++CONFIG_S390_FLIC_KVM=y ++CONFIG_SCLPCONSOLE=y ++CONFIG_SCSI=y ++CONFIG_TERMINAL3270=y ++CONFIG_VFIO=y ++CONFIG_VFIO_AP=y ++CONFIG_VFIO_PCI=y ++CONFIG_VHOST_USER=y ++CONFIG_VIRTIO_CCW=y ++CONFIG_WDT_DIAG288=y +diff --git a/default-configs/s390x-softmmu.mak b/default-configs/s390x-softmmu.mak +index f2287a133f..3e2e388e91 100644 +--- a/default-configs/s390x-softmmu.mak ++++ b/default-configs/s390x-softmmu.mak +@@ -10,4 +10,6 @@ + + # Boards: + # +-CONFIG_S390_CCW_VIRTIO=y ++#CONFIG_S390_CCW_VIRTIO=y ++ ++include s390x-rh-devices.mak +diff --git a/default-configs/x86_64-rh-devices.mak b/default-configs/x86_64-rh-devices.mak +new file mode 100644 +index 0000000000..d59b6d9bb5 +--- /dev/null ++++ b/default-configs/x86_64-rh-devices.mak +@@ -0,0 +1,100 @@ ++include rh-virtio.mak ++ ++CONFIG_AC97=y ++CONFIG_ACPI=y ++CONFIG_ACPI_PCI=y ++CONFIG_ACPI_CPU_HOTPLUG=y ++CONFIG_ACPI_MEMORY_HOTPLUG=y ++CONFIG_ACPI_NVDIMM=y ++CONFIG_ACPI_SMBUS=y ++CONFIG_ACPI_VMGENID=y ++CONFIG_ACPI_X86=y ++CONFIG_ACPI_X86_ICH=y ++CONFIG_AHCI=y ++CONFIG_APIC=y ++CONFIG_APM=y ++CONFIG_BOCHS_DISPLAY=y ++CONFIG_DIMM=y ++CONFIG_E1000E_PCI_EXPRESS=y ++CONFIG_E1000_PCI=y ++CONFIG_EDU=y ++CONFIG_FDC=y ++CONFIG_FW_CFG_DMA=y ++CONFIG_HDA=y ++CONFIG_HYPERV=y ++CONFIG_HYPERV_TESTDEV=y ++CONFIG_I2C=y ++CONFIG_I440FX=y ++CONFIG_I8254=y ++CONFIG_I8257=y ++CONFIG_I8259=y ++CONFIG_I82801B11=y ++CONFIG_IDE_CORE=y ++CONFIG_IDE_PCI=y ++CONFIG_IDE_PIIX=y ++CONFIG_IDE_QDEV=y ++CONFIG_IOAPIC=y ++CONFIG_IOH3420=y ++CONFIG_ISA_BUS=y ++CONFIG_ISA_DEBUG=y ++CONFIG_ISA_TESTDEV=y ++CONFIG_LPC_ICH9=y ++CONFIG_MC146818RTC=y ++CONFIG_MEM_DEVICE=y ++CONFIG_NVDIMM=y ++CONFIG_OPENGL=y ++CONFIG_PAM=y ++CONFIG_PC=y ++CONFIG_PCI=y ++CONFIG_PCIE_PORT=y ++CONFIG_PCI_DEVICES=y ++CONFIG_PCI_EXPRESS=y ++CONFIG_PCI_EXPRESS_Q35=y ++CONFIG_PCI_I440FX=y ++CONFIG_PCI_TESTDEV=y ++CONFIG_PCKBD=y ++CONFIG_PCSPK=y ++CONFIG_PC_ACPI=y ++CONFIG_PC_PCI=y ++CONFIG_PFLASH_CFI01=y ++CONFIG_PVPANIC=y ++CONFIG_PXB=y ++CONFIG_Q35=y 
++CONFIG_QXL=y ++CONFIG_RTL8139_PCI=y ++CONFIG_SCSI=y ++CONFIG_SERIAL=y ++CONFIG_SERIAL_ISA=y ++CONFIG_SERIAL_PCI=y ++CONFIG_SEV=y ++CONFIG_SGA=y ++CONFIG_SMBIOS=y ++CONFIG_SMBUS_EEPROM=y ++CONFIG_SPICE=y ++CONFIG_TEST_DEVICES=y ++CONFIG_USB=y ++CONFIG_USB_EHCI=y ++CONFIG_USB_EHCI_PCI=y ++CONFIG_USB_SMARTCARD=y ++CONFIG_USB_STORAGE_BOT=y ++CONFIG_USB_UHCI=y ++CONFIG_USB_XHCI=y ++CONFIG_USB_XHCI_NEC=y ++CONFIG_VFIO=y ++CONFIG_VFIO_PCI=y ++CONFIG_VGA=y ++CONFIG_VGA_CIRRUS=y ++CONFIG_VGA_PCI=y ++CONFIG_VHOST_USER=y ++CONFIG_VIRTIO_PCI=y ++CONFIG_VIRTIO_VGA=y ++CONFIG_VMMOUSE=y ++CONFIG_VMPORT=y ++CONFIG_VTD=y ++CONFIG_WDT_IB6300ESB=y ++CONFIG_WDT_IB700=y ++CONFIG_XIO3130=y ++CONFIG_TPM_CRB=y ++CONFIG_TPM_TIS=y ++CONFIG_TPM_EMULATOR=y ++CONFIG_TPM_PASSTHROUGH=y +diff --git a/default-configs/x86_64-softmmu.mak b/default-configs/x86_64-softmmu.mak +index 64b2ee2960..b5de7e5279 100644 +--- a/default-configs/x86_64-softmmu.mak ++++ b/default-configs/x86_64-softmmu.mak +@@ -1,3 +1,5 @@ + # Default configuration for x86_64-softmmu + +-include i386-softmmu.mak ++#include i386-softmmu.mak ++ ++include x86_64-rh-devices.mak +diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c +index 2034dd749e..ab203ad448 100644 +--- a/hw/acpi/ich9.c ++++ b/hw/acpi/ich9.c +@@ -449,8 +449,8 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp) + static const uint32_t gpe0_len = ICH9_PMIO_GPE0_LEN; + pm->acpi_memory_hotplug.is_enabled = true; + pm->cpu_hotplug_legacy = true; +- pm->disable_s3 = 0; +- pm->disable_s4 = 0; ++ pm->disable_s3 = 1; ++ pm->disable_s4 = 1; + pm->s4_val = 2; + + object_property_add_uint32_ptr(obj, ACPI_PM_PROP_PM_IO_BASE, +diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs +index fe749f65fd..2aa1a9efdd 100644 +--- a/hw/arm/Makefile.objs ++++ b/hw/arm/Makefile.objs +@@ -27,7 +27,7 @@ obj-$(CONFIG_VEXPRESS) += vexpress.o + obj-$(CONFIG_ZYNQ) += xilinx_zynq.o + obj-$(CONFIG_SABRELITE) += sabrelite.o + +-obj-$(CONFIG_ARM_V7M) += armv7m.o 
++#obj-$(CONFIG_ARM_V7M) += armv7m.o + obj-$(CONFIG_EXYNOS4) += exynos4210.o + obj-$(CONFIG_PXA2XX) += pxa2xx.o pxa2xx_gpio.o pxa2xx_pic.o + obj-$(CONFIG_DIGIC) += digic.o +diff --git a/hw/block/fdc.c b/hw/block/fdc.c +index ac5d31e8c1..e925bac002 100644 +--- a/hw/block/fdc.c ++++ b/hw/block/fdc.c +@@ -46,6 +46,8 @@ + #include "qemu/module.h" + #include "trace.h" + ++#include "hw/boards.h" ++ + /********************************************************/ + /* debug Floppy devices */ + +@@ -2638,6 +2640,14 @@ static void fdctrl_realize_common(DeviceState *dev, FDCtrl *fdctrl, + int i, j; + static int command_tables_inited = 0; + ++ /* Restricted for Red Hat Enterprise Linux: */ ++ MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); ++ if (!strstr(mc->name, "-rhel7.")) { ++ error_setg(errp, "Device %s is not supported with machine type %s", ++ object_get_typename(OBJECT(dev)), mc->name); ++ return; ++ } ++ + if (fdctrl->fallback == FLOPPY_DRIVE_TYPE_AUTO) { + error_setg(errp, "Cannot choose a fallback FDrive type of 'auto'"); + } +diff --git a/hw/bt/Makefile.objs b/hw/bt/Makefile.objs +index 867a7d2e8a..e678e9ee3c 100644 +--- a/hw/bt/Makefile.objs ++++ b/hw/bt/Makefile.objs +@@ -1,3 +1,3 @@ +-common-obj-y += core.o l2cap.o sdp.o hci.o hid.o +-common-obj-y += hci-csr.o ++#common-obj-y += core.o l2cap.o sdp.o hci.o hid.o ++#common-obj-y += hci-csr.o + +diff --git a/hw/cpu/Makefile.objs b/hw/cpu/Makefile.objs +index 8db9e8a7b3..1601ea93c7 100644 +--- a/hw/cpu/Makefile.objs ++++ b/hw/cpu/Makefile.objs +@@ -1,5 +1,6 @@ + obj-$(CONFIG_ARM11MPCORE) += arm11mpcore.o + obj-$(CONFIG_REALVIEW) += realview_mpcore.o + obj-$(CONFIG_A9MPCORE) += a9mpcore.o +-obj-$(CONFIG_A15MPCORE) += a15mpcore.o +-common-obj-y += core.o cluster.o ++#obj-$(CONFIG_A15MPCORE) += a15mpcore.o ++common-obj-y += core.o ++# cluster.o +diff --git a/hw/display/Makefile.objs b/hw/display/Makefile.objs +index f2182e3bef..3d0cda1b52 100644 +--- a/hw/display/Makefile.objs ++++ b/hw/display/Makefile.objs +@@ 
-1,8 +1,9 @@ + common-obj-$(CONFIG_DDC) += i2c-ddc.o + common-obj-$(CONFIG_EDID) += edid-generate.o edid-region.o + +-common-obj-$(CONFIG_FW_CFG_DMA) += ramfb.o +-common-obj-$(CONFIG_FW_CFG_DMA) += ramfb-standalone.o ++# Disabled for Red Hat Enterprise Linux ++#common-obj-$(CONFIG_FW_CFG_DMA) += ramfb.o ++#common-obj-$(CONFIG_FW_CFG_DMA) += ramfb-standalone.o + + common-obj-$(CONFIG_ADS7846) += ads7846.o + common-obj-$(CONFIG_VGA_CIRRUS) += cirrus_vga.o +diff --git a/hw/display/cirrus_vga.c b/hw/display/cirrus_vga.c +index cd283e53b4..93afa26fda 100644 +--- a/hw/display/cirrus_vga.c ++++ b/hw/display/cirrus_vga.c +@@ -2975,6 +2975,9 @@ static void pci_cirrus_vga_realize(PCIDevice *dev, Error **errp) + PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev); + int16_t device_id = pc->device_id; + ++ warn_report("'cirrus-vga' is deprecated, " ++ "please use a different VGA card instead"); ++ + /* follow real hardware, cirrus card emulated has 4 MB video memory. + Also accept 8 MB/16 MB for backward compatibility. 
*/ + if (s->vga.vram_size_mb != 4 && s->vga.vram_size_mb != 8 && +diff --git a/hw/ide/piix.c b/hw/ide/piix.c +index db313dd3b1..e14858ca64 100644 +--- a/hw/ide/piix.c ++++ b/hw/ide/piix.c +@@ -251,7 +251,8 @@ static void piix3_ide_class_init(ObjectClass *klass, void *data) + k->device_id = PCI_DEVICE_ID_INTEL_82371SB_1; + k->class_id = PCI_CLASS_STORAGE_IDE; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); +- dc->hotpluggable = false; ++ /* Disabled for Red Hat Enterprise Linux: */ ++ dc->user_creatable = false; + } + + static const TypeInfo piix3_ide_info = { +@@ -279,6 +280,8 @@ static void piix4_ide_class_init(ObjectClass *klass, void *data) + k->class_id = PCI_CLASS_STORAGE_IDE; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + dc->hotpluggable = false; ++ /* Disabled for Red Hat Enterprise Linux: */ ++ dc->user_creatable = false; + } + + static const TypeInfo piix4_ide_info = { +diff --git a/hw/input/pckbd.c b/hw/input/pckbd.c +index f0acfd86f7..390eb6579c 100644 +--- a/hw/input/pckbd.c ++++ b/hw/input/pckbd.c +@@ -571,6 +571,8 @@ static void i8042_class_initfn(ObjectClass *klass, void *data) + dc->realize = i8042_realizefn; + dc->vmsd = &vmstate_kbd_isa; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); ++ /* Disabled for Red Hat Enterprise Linux: */ ++ dc->user_creatable = false; + } + + static const TypeInfo i8042_info = { +diff --git a/hw/net/e1000.c b/hw/net/e1000.c +index a73f8d404e..fc73fdd6fa 100644 +--- a/hw/net/e1000.c ++++ b/hw/net/e1000.c +@@ -1795,6 +1795,7 @@ static const E1000Info e1000_devices[] = { + .revision = 0x03, + .phy_id2 = E1000_PHY_ID2_8254xx_DEFAULT, + }, ++#if 0 /* Disabled for Red Hat Enterprise Linux 7 */ + { + .name = "e1000-82544gc", + .device_id = E1000_DEV_ID_82544GC_COPPER, +@@ -1807,6 +1808,7 @@ static const E1000Info e1000_devices[] = { + .revision = 0x03, + .phy_id2 = E1000_PHY_ID2_8254xx_DEFAULT, + }, ++#endif + }; + + static void e1000_register_types(void) +diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c 
+index f27131102d..17f10efae2 100644 +--- a/hw/pci-host/i440fx.c ++++ b/hw/pci-host/i440fx.c +@@ -386,6 +386,7 @@ static const TypeInfo i440fx_info = { + }, + }; + ++#if 0 /* Disabled in Red Hat Enterprise Linux */ + /* IGD Passthrough Host Bridge. */ + typedef struct { + uint8_t offset; +@@ -469,6 +470,7 @@ static const TypeInfo igd_passthrough_i440fx_info = { + .instance_size = sizeof(PCII440FXState), + .class_init = igd_passthrough_i440fx_class_init, + }; ++#endif + + static const char *i440fx_pcihost_root_bus_path(PCIHostState *host_bridge, + PCIBus *rootbus) +@@ -514,7 +516,9 @@ static const TypeInfo i440fx_pcihost_info = { + static void i440fx_register_types(void) + { + type_register_static(&i440fx_info); ++#if 0 /* Disabled in Red Hat Enterprise Linux */ + type_register_static(&igd_passthrough_i440fx_info); ++#endif + type_register_static(&i440fx_pcihost_info); + } + +diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c +index 8339c4c0f8..301cd7b4e4 100644 +--- a/hw/ppc/spapr_cpu_core.c ++++ b/hw/ppc/spapr_cpu_core.c +@@ -403,10 +403,12 @@ static const TypeInfo spapr_cpu_core_type_infos[] = { + .instance_size = sizeof(SpaprCpuCore), + .class_size = sizeof(SpaprCpuCoreClass), + }, ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + DEFINE_SPAPR_CPU_CORE_TYPE("970_v2.2"), + DEFINE_SPAPR_CPU_CORE_TYPE("970mp_v1.0"), + DEFINE_SPAPR_CPU_CORE_TYPE("970mp_v1.1"), + DEFINE_SPAPR_CPU_CORE_TYPE("power5+_v2.1"), ++#endif + DEFINE_SPAPR_CPU_CORE_TYPE("power7_v2.3"), + DEFINE_SPAPR_CPU_CORE_TYPE("power7+_v2.1"), + DEFINE_SPAPR_CPU_CORE_TYPE("power8_v2.0"), +diff --git a/hw/usb/Makefile.objs b/hw/usb/Makefile.objs +index 303ac084a0..700a91886e 100644 +--- a/hw/usb/Makefile.objs ++++ b/hw/usb/Makefile.objs +@@ -30,7 +30,9 @@ common-obj-$(CONFIG_USB_BLUETOOTH) += dev-bluetooth.o + ifeq ($(CONFIG_USB_SMARTCARD),y) + common-obj-y += dev-smartcard-reader.o + common-obj-$(CONFIG_SMARTCARD) += smartcard.mo +-smartcard.mo-objs := ccid-card-passthru.o 
ccid-card-emulated.o ++# Disabled for Red Hat Enterprise Linux: ++# smartcard.mo-objs := ccid-card-passthru.o ccid-card-emulated.o ++smartcard.mo-objs := ccid-card-passthru.o + smartcard.mo-cflags := $(SMARTCARD_CFLAGS) + smartcard.mo-libs := $(SMARTCARD_LIBS) + endif +diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c +index 136f3a9ad6..4505ffe48a 100644 +--- a/hw/vfio/pci-quirks.c ++++ b/hw/vfio/pci-quirks.c +@@ -1166,6 +1166,7 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) + trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name); + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + /* + * Intel IGD support + * +@@ -1239,6 +1240,7 @@ static int igd_gen(VFIOPCIDevice *vdev) + + return 8; /* Assume newer is compatible */ + } ++#endif + + typedef struct VFIOIGDQuirk { + struct VFIOPCIDevice *vdev; +@@ -1311,6 +1313,7 @@ typedef struct { + uint8_t len; + } IGDHostInfo; + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static const IGDHostInfo igd_host_bridge_infos[] = { + {PCI_REVISION_ID, 2}, + {PCI_SUBSYSTEM_VENDOR_ID, 2}, +@@ -1559,9 +1562,11 @@ static const MemoryRegionOps vfio_igd_index_quirk = { + .write = vfio_igd_quirk_index_write, + .endianness = DEVICE_LITTLE_ENDIAN, + }; ++#endif + + static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) + { ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + struct vfio_region_info *rom = NULL, *opregion = NULL, + *host = NULL, *lpc = NULL; + VFIOQuirk *quirk; +@@ -1572,6 +1577,7 @@ static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) + uint32_t gmch; + uint16_t cmd_orig, cmd; + Error *err = NULL; ++#endif + + /* + * This must be an Intel VGA device at address 00:02.0 for us to even +@@ -1585,6 +1591,8 @@ static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) + return; + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ ++ + /* + * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we + * can stuff host values into, 
so if there's already one there and it's not +@@ -1809,6 +1817,7 @@ out: + g_free(opregion); + g_free(host); + g_free(lpc); ++#endif + } + + /* +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 2d40b396f2..c8534d3035 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3220,6 +3220,7 @@ static const TypeInfo vfio_pci_dev_info = { + }, + }; + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static Property vfio_pci_dev_nohotplug_properties[] = { + DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), + DEFINE_PROP_END_OF_LIST(), +@@ -3239,11 +3240,15 @@ static const TypeInfo vfio_pci_nohotplug_dev_info = { + .instance_size = sizeof(VFIOPCIDevice), + .class_init = vfio_pci_nohotplug_dev_class_init, + }; ++#endif + + static void register_vfio_pci_dev_type(void) + { + type_register_static(&vfio_pci_dev_info); ++ ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + type_register_static(&vfio_pci_nohotplug_dev_info); ++#endif + } + + type_init(register_vfio_pci_dev_type) +diff --git a/qemu-options.hx b/qemu-options.hx +index 65c9473b73..fc17aca631 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -2111,11 +2111,6 @@ ETEXI + + DEF("no-hpet", 0, QEMU_OPTION_no_hpet, + "-no-hpet disable HPET\n", QEMU_ARCH_I386) +-STEXI +-@item -no-hpet +-@findex -no-hpet +-Disable HPET support. 
+-ETEXI + + DEF("acpitable", HAS_ARG, QEMU_OPTION_acpitable, + "-acpitable [sig=str][,rev=n][,oem_id=str][,oem_table_id=str][,oem_rev=n][,asl_compiler_id=str][,asl_compiler_rev=n][,{data|file}=file1[:file2]...]\n" +@@ -3125,6 +3120,7 @@ STEXI + ETEXI + DEFHEADING() + ++#if 0 + DEFHEADING(Bluetooth(R) options:) + STEXI + @table @option +@@ -3203,6 +3199,7 @@ STEXI + @end table + ETEXI + DEFHEADING() ++#endif + + #ifdef CONFIG_TPM + DEFHEADING(TPM device options:) +diff --git a/target/arm/cpu.c b/target/arm/cpu.c +index 7a4ac9339b..3788fc3c4a 100644 +--- a/target/arm/cpu.c ++++ b/target/arm/cpu.c +@@ -2744,7 +2744,9 @@ static void arm_cpu_register_types(void) + type_register_static(&idau_interface_type_info); + + while (info->name) { +- cpu_register(info); ++ /* RHEL specific: Filter out unsupported cpu models */ ++ if (!strcmp(info->name, "cortex-a15")) ++ cpu_register(info); + info++; + } + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 69f518a21a..1b7880ae3a 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1835,14 +1835,14 @@ static X86CPUDefinition builtin_x86_defs[] = { + .family = 6, + .model = 6, + .stepping = 3, +- .features[FEAT_1_EDX] = +- PPRO_FEATURES | +- CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA | +- CPUID_PSE36, +- .features[FEAT_1_ECX] = +- CPUID_EXT_SSE3 | CPUID_EXT_CX16, +- .features[FEAT_8000_0001_EDX] = +- CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX, ++ .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | ++ CPUID_MMX | CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | ++ CPUID_MCA | CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | ++ CPUID_CX8 | CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | ++ CPUID_PSE | CPUID_DE | CPUID_FP87, ++ .features[FEAT_1_ECX] = CPUID_EXT_CX16 | CPUID_EXT_SSE3, ++ .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_NX | ++ CPUID_EXT2_SYSCALL, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM, + .xlevel = 0x8000000A, +@@ -2128,6 +2128,25 @@ static 
X86CPUDefinition builtin_x86_defs[] = { + .xlevel = 0x80000008, + .model_id = "Intel(R) Atom(TM) CPU N270 @ 1.60GHz", + }, ++ { ++ .name = "cpu64-rhel6", ++ .level = 4, ++ .vendor = CPUID_VENDOR_AMD, ++ .family = 6, ++ .model = 13, ++ .stepping = 3, ++ .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | ++ CPUID_MMX | CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | ++ CPUID_MCA | CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | ++ CPUID_CX8 | CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | ++ CPUID_PSE | CPUID_DE | CPUID_FP87, ++ .features[FEAT_1_ECX] = CPUID_EXT_CX16 | CPUID_EXT_SSE3, ++ .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_NX | CPUID_EXT2_SYSCALL, ++ .features[FEAT_8000_0001_ECX] = CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | ++ CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, ++ .xlevel = 0x8000000A, ++ .model_id = "QEMU Virtual CPU version (cpu64-rhel6)", ++ }, + { + .name = "Conroe", + .level = 10, +diff --git a/target/ppc/cpu-models.c b/target/ppc/cpu-models.c +index 086548e9b9..1bbf378c18 100644 +--- a/target/ppc/cpu-models.c ++++ b/target/ppc/cpu-models.c +@@ -66,6 +66,7 @@ + #define POWERPC_DEF(_name, _pvr, _type, _desc) \ + POWERPC_DEF_SVR(_name, _desc, _pvr, POWERPC_SVR_NONE, _type) + ++#if 0 /* Embedded and 32-bit CPUs disabled for Red Hat Enterprise Linux */ + /* Embedded PowerPC */ + /* PowerPC 401 family */ + POWERPC_DEF("401", CPU_POWERPC_401, 401, +@@ -740,8 +741,10 @@ + "PowerPC 7447A v1.2 (G4)") + POWERPC_DEF("7457a_v1.2", CPU_POWERPC_74x7A_v12, 7455, + "PowerPC 7457A v1.2 (G4)") ++#endif + /* 64 bits PowerPC */ + #if defined(TARGET_PPC64) ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + POWERPC_DEF("970_v2.2", CPU_POWERPC_970_v22, 970, + "PowerPC 970 v2.2") + POWERPC_DEF("970fx_v1.0", CPU_POWERPC_970FX_v10, 970, +@@ -760,6 +763,7 @@ + "PowerPC 970MP v1.1") + POWERPC_DEF("power5+_v2.1", CPU_POWERPC_POWER5P_v21, POWER5P, + "POWER5+ v2.1") ++#endif + POWERPC_DEF("power7_v2.3", CPU_POWERPC_POWER7_v23, POWER7, + "POWER7 v2.3") 
+ POWERPC_DEF("power7+_v2.1", CPU_POWERPC_POWER7P_v21, POWER7, +@@ -780,6 +784,7 @@ + /* PowerPC CPU aliases */ + + PowerPCCPUAlias ppc_cpu_aliases[] = { ++#if 0 /* Embedded and 32-bit CPUs disabled for Red Hat Enterprise Linux */ + { "403", "403gc" }, + { "405", "405d4" }, + { "405cr", "405crc" }, +@@ -938,12 +943,15 @@ PowerPCCPUAlias ppc_cpu_aliases[] = { + { "7447a", "7447a_v1.2" }, + { "7457a", "7457a_v1.2" }, + { "apollo7pm", "7457a_v1.0" }, ++#endif + #if defined(TARGET_PPC64) ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + { "970", "970_v2.2" }, + { "970fx", "970fx_v3.1" }, + { "970mp", "970mp_v1.1" }, + { "power5+", "power5+_v2.1" }, + { "power5gs", "power5+_v2.1" }, ++#endif + { "power7", "power7_v2.3" }, + { "power7+", "power7+_v2.1" }, + { "power8e", "power8e_v2.1" }, +@@ -952,6 +960,7 @@ PowerPCCPUAlias ppc_cpu_aliases[] = { + { "power9", "power9_v2.0" }, + #endif + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + /* Generic PowerPCs */ + #if defined(TARGET_PPC64) + { "ppc64", "970fx_v3.1" }, +@@ -959,5 +968,6 @@ PowerPCCPUAlias ppc_cpu_aliases[] = { + { "ppc32", "604" }, + { "ppc", "604" }, + { "default", "604" }, ++#endif + { NULL, NULL } + }; +diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c +index 7e92fb2e15..be718220d7 100644 +--- a/target/s390x/cpu_models.c ++++ b/target/s390x/cpu_models.c +@@ -404,6 +404,9 @@ static void check_unavailable_features(const S390CPUModel *max_model, + (max_model->def->gen == model->def->gen && + max_model->def->ec_ga < model->def->ec_ga)) { + list_add_feat("type", unavailable); ++ } else if (model->def->gen < 11 && kvm_enabled()) { ++ /* Older CPU models are not supported on Red Hat Enterprise Linux */ ++ list_add_feat("type", unavailable); + } + + /* detect missing features if any to properly report them */ +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 0c9d14b4b1..a02d569537 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -2387,6 +2387,14 @@ void 
kvm_s390_apply_cpu_model(const S390CPUModel *model, Error **errp) + error_setg(errp, "KVM doesn't support CPU models"); + return; + } ++ ++ /* Older CPU models are not supported on Red Hat Enterprise Linux */ ++ if (model->def->gen < 11) { ++ error_setg(errp, "KVM: Unsupported CPU type specified: %s", ++ MACHINE(qdev_get_machine())->cpu_type); ++ return; ++ } ++ + prop.cpuid = s390_cpuid_from_cpu_model(model); + prop.ibc = s390_ibc_from_cpu_model(model); + /* configure cpu features indicated via STFL(e) */ +diff --git a/util/memfd.c b/util/memfd.c +index 4a3c07e0be..3303ec9da4 100644 +--- a/util/memfd.c ++++ b/util/memfd.c +@@ -193,7 +193,7 @@ bool qemu_memfd_alloc_check(void) + */ + bool qemu_memfd_check(unsigned int flags) + { +-#ifdef CONFIG_LINUX ++#if 0 /* RHEL: memfd support disabled */ + int mfd = memfd_create("test", flags | MFD_CLOEXEC); + + if (mfd >= 0) { +diff --git a/vl.c b/vl.c +index 6a65a64bfd..668a34577e 100644 +--- a/vl.c ++++ b/vl.c +@@ -166,7 +166,7 @@ Chardev *parallel_hds[MAX_PARALLEL_PORTS]; + int win2k_install_hack = 0; + int singlestep = 0; + int acpi_enabled = 1; +-int no_hpet = 0; ++int no_hpet = 1; /* Always disabled for Red Hat Enterprise Linux */ + int fd_bootchk = 1; + static int no_reboot; + int no_shutdown = 0; +@@ -914,6 +914,7 @@ static void configure_rtc(QemuOpts *opts) + } + } + ++#if 0 // Disabled for Red Hat Enterprise Linux + /***********************************************************/ + /* Bluetooth support */ + static int nb_hcis; +@@ -1035,6 +1036,7 @@ static int bt_parse(const char *opt) + error_report("bad bluetooth parameter '%s'", opt); + return 1; + } ++#endif + + static int parse_name(void *opaque, QemuOpts *opts, Error **errp) + { +@@ -3128,6 +3130,7 @@ int main(int argc, char **argv, char **envp) + } + break; + #endif ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + case QEMU_OPTION_bt: + warn_report("The bluetooth subsystem is deprecated and will " + "be removed soon. 
If the bluetooth subsystem is " +@@ -3135,6 +3138,7 @@ int main(int argc, char **argv, char **envp) + "qemu-devel@nongnu.org with your usecase."); + add_device_config(DEV_BT, optarg); + break; ++#endif + case QEMU_OPTION_audio_help: + audio_legacy_help(); + exit (0); +@@ -4282,9 +4286,11 @@ int main(int argc, char **argv, char **envp) + + tpm_init(); + ++#if 0 // Disabled for Red Hat Enterprise Linux + /* init the bluetooth world */ + if (foreach_device_config(DEV_BT, bt_parse)) + exit(1); ++#endif + + if (!xen_enabled()) { + /* On 32-bit hosts, QEMU is limited by virtual address space */ +-- +2.21.0 + diff --git a/0007-Machine-type-related-general-changes.patch b/0007-Machine-type-related-general-changes.patch new file mode 100755 index 0000000..4ae3966 --- /dev/null +++ b/0007-Machine-type-related-general-changes.patch @@ -0,0 +1,675 @@ +From 113078b23a4747b07eb363719d7cbc0af403dd2a Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 11 Jan 2019 09:54:45 +0100 +Subject: Machine type related general changes + +This patch is first part of original "Add RHEL machine types" patch we +split to allow easier review. It contains changes not related to any +architecture. 
+ +Signed-off-by: Miroslav Rezanina + +Rebase changes (4.0.0): +- Remove e1000 device duplication changes to reflect upstream solution +- Rewrite machine compat properties to upstream solution + +Rebase changes (4.1.0): +- Removed optional flag for machine compat properties (upstream) +- Remove c3e002cb chunk from hw/net/e1000.c +- Reorder compat structures +- Use one format for compat structures +- Added compat for virtio-balloon-pci.any_layout for rhel71 + +Merged patches (4.0.0): +- d4c0957 compat: Generic HW_COMPAT_RHEL7_6 +- cbac773 virtio: Make disable-legacy/disable-modern compat properties optional + +Merged patches (4.1.0): +- 479ad30 redhat: fix cut'n'paste garbage in hw_compat comments +- f19738e compat: Generic hw_compat_rhel_8_0 + +Merged patches (4.2.0): +- 9f2bfaa machine types: Update hw_compat_rhel_8_0 from hw_compat_4_0 +- ca4a5e8 virtio: Make disable-legacy/disable-modern compat properties optional +- compat: Generic hw_compat_rhel_8_1 (patch 93040/92956) + +Signed-off-by: Danilo C. L.
de Paula +--- + hw/acpi/ich9.c | 16 ++++ + hw/acpi/piix4.c | 5 +- + hw/char/serial.c | 16 ++++ + hw/core/machine.c | 170 ++++++++++++++++++++++++++++++++++++++++ + hw/display/vga-isa.c | 2 +- + hw/net/e1000e.c | 21 +++++ + hw/net/rtl8139.c | 4 +- + hw/rtc/mc146818rtc.c | 6 ++ + hw/smbios/smbios.c | 1 + + hw/timer/i8254_common.c | 2 +- + hw/usb/hcd-uhci.c | 4 +- + hw/usb/hcd-xhci.c | 20 +++++ + hw/usb/hcd-xhci.h | 2 + + include/hw/acpi/ich9.h | 3 + + include/hw/boards.h | 24 ++++++ + include/hw/usb.h | 4 + + migration/migration.c | 2 + + migration/migration.h | 5 ++ + 18 files changed, 301 insertions(+), 6 deletions(-) + +diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c +index ab203ad448..7ec26884e8 100644 +--- a/hw/acpi/ich9.c ++++ b/hw/acpi/ich9.c +@@ -444,6 +444,18 @@ static void ich9_pm_set_enable_tco(Object *obj, bool value, Error **errp) + s->pm.enable_tco = value; + } + ++static bool ich9_pm_get_force_rev1_fadt(Object *obj, Error **errp) ++{ ++ ICH9LPCState *s = ICH9_LPC_DEVICE(obj); ++ return s->pm.force_rev1_fadt; ++} ++ ++static void ich9_pm_set_force_rev1_fadt(Object *obj, bool value, Error **errp) ++{ ++ ICH9LPCState *s = ICH9_LPC_DEVICE(obj); ++ s->pm.force_rev1_fadt = value; ++} ++ + void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp) + { + static const uint32_t gpe0_len = ICH9_PMIO_GPE0_LEN; +@@ -468,6 +480,10 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp) + ich9_pm_get_cpu_hotplug_legacy, + ich9_pm_set_cpu_hotplug_legacy, + NULL); ++ object_property_add_bool(obj, "__com.redhat_force-rev1-fadt", ++ ich9_pm_get_force_rev1_fadt, ++ ich9_pm_set_force_rev1_fadt, ++ NULL); + object_property_add(obj, ACPI_PM_PROP_S3_DISABLED, "uint8", + ich9_pm_get_disable_s3, + ich9_pm_set_disable_s3, +diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c +index 93aec2dd2c..3a26193cbe 100644 +--- a/hw/acpi/piix4.c ++++ b/hw/acpi/piix4.c +@@ -274,6 +274,7 @@ static const VMStateDescription vmstate_acpi = { + .name = "piix4_pm", + 
.version_id = 3, + .minimum_version_id = 3, ++ .minimum_version_id = 2, + .post_load = vmstate_acpi_post_load, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, PIIX4PMState), +@@ -627,8 +628,8 @@ static void piix4_send_gpe(AcpiDeviceIf *adev, AcpiEventStatusBits ev) + + static Property piix4_pm_properties[] = { + DEFINE_PROP_UINT32("smb_io_base", PIIX4PMState, smb_io_base, 0), +- DEFINE_PROP_UINT8(ACPI_PM_PROP_S3_DISABLED, PIIX4PMState, disable_s3, 0), +- DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_DISABLED, PIIX4PMState, disable_s4, 0), ++ DEFINE_PROP_UINT8(ACPI_PM_PROP_S3_DISABLED, PIIX4PMState, disable_s3, 1), ++ DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_DISABLED, PIIX4PMState, disable_s4, 1), + DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_VAL, PIIX4PMState, s4_val, 2), + DEFINE_PROP_BOOL("acpi-pci-hotplug-with-bridge-support", PIIX4PMState, + use_acpi_pci_hotplug, true), +diff --git a/hw/char/serial.c b/hw/char/serial.c +index b4aa250950..0012f0e44d 100644 +--- a/hw/char/serial.c ++++ b/hw/char/serial.c +@@ -34,6 +34,7 @@ + #include "sysemu/runstate.h" + #include "qemu/error-report.h" + #include "trace.h" ++#include "migration/migration.h" + + //#define DEBUG_SERIAL + +@@ -703,6 +704,9 @@ static int serial_post_load(void *opaque, int version_id) + static bool serial_thr_ipending_needed(void *opaque) + { + SerialState *s = opaque; ++ if (migrate_pre_2_2) { ++ return false; ++ } + + if (s->ier & UART_IER_THRI) { + bool expected_value = ((s->iir & UART_IIR_ID) == UART_IIR_THRI); +@@ -784,6 +788,10 @@ static const VMStateDescription vmstate_serial_xmit_fifo = { + static bool serial_fifo_timeout_timer_needed(void *opaque) + { + SerialState *s = (SerialState *)opaque; ++ if (migrate_pre_2_2) { ++ return false; ++ } ++ + return timer_pending(s->fifo_timeout_timer); + } + +@@ -801,6 +809,10 @@ static const VMStateDescription vmstate_serial_fifo_timeout_timer = { + static bool serial_timeout_ipending_needed(void *opaque) + { + SerialState *s = (SerialState *)opaque; ++ if 
(migrate_pre_2_2) { ++ return false; ++ } ++ + return s->timeout_ipending != 0; + } + +@@ -818,6 +830,10 @@ static const VMStateDescription vmstate_serial_timeout_ipending = { + static bool serial_poll_needed(void *opaque) + { + SerialState *s = (SerialState *)opaque; ++ if (migrate_pre_2_2) { ++ return false; ++ } ++ + return s->poll_msl >= 0; + } + +diff --git a/hw/core/machine.c b/hw/core/machine.c +index 1689ad3bf8..e0e0eec8bf 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -27,6 +27,176 @@ + #include "hw/pci/pci.h" + #include "hw/mem/nvdimm.h" + ++/* ++ * The same as hw_compat_4_1 ++ */ ++GlobalProperty hw_compat_rhel_8_1[] = { ++ /* hw_compat_rhel_8_1 from hw_compat_4_1 */ ++ { "virtio-pci", "x-pcie-flr-init", "off" }, ++}; ++const size_t hw_compat_rhel_8_1_len = G_N_ELEMENTS(hw_compat_rhel_8_1); ++ ++/* The same as hw_compat_3_1 ++ * format of array has been changed by: ++ * 6c36bddf5340 ("machine: Use shorter format for GlobalProperty arrays") ++ */ ++GlobalProperty hw_compat_rhel_8_0[] = { ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "pcie-root-port", "x-speed", "2_5" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "pcie-root-port", "x-width", "1" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "memory-backend-file", "x-use-canonical-path-for-ramblock-id", "true" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "memory-backend-memfd", "x-use-canonical-path-for-ramblock-id", "true" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "tpm-crb", "ppi", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "tpm-tis", "ppi", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "usb-kbd", "serial", "42" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "usb-mouse", "serial", "42" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "usb-tablet", "serial", "42" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "virtio-blk-device", "discard", "false" }, ++ /* hw_compat_rhel_8_0 from 
hw_compat_3_1 */ ++ { "virtio-blk-device", "write-zeroes", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "VGA", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "secondary-vga", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "bochs-display", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "virtio-vga", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "virtio-gpu-pci", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "virtio-device", "use-started", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 - that was added in 4.1 */ ++ { "pcie-root-port-base", "disable-acs", "true" }, ++}; ++const size_t hw_compat_rhel_8_0_len = G_N_ELEMENTS(hw_compat_rhel_8_0); ++ ++/* The same as hw_compat_3_0 + hw_compat_2_12 ++ * except that ++ * there's nothing in 3_0 ++ * migration.decompress-error-check=off was in 7.5 from bz 1584139 ++ */ ++GlobalProperty hw_compat_rhel_7_6[] = { ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "hda-audio", "use-timer", "false" }, ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "cirrus-vga", "global-vmstate", "true" }, ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "VGA", "global-vmstate", "true" }, ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "vmware-svga", "global-vmstate", "true" }, ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "qxl-vga", "global-vmstate", "true" }, ++}; ++const size_t hw_compat_rhel_7_6_len = G_N_ELEMENTS(hw_compat_rhel_7_6); ++ ++/* The same as hw_compat_2_11 + hw_compat_2_10 */ ++GlobalProperty hw_compat_rhel_7_5[] = { ++ /* hw_compat_rhel_7_5 from hw_compat_2_11 */ ++ { "hpet", "hpet-offset-saved", "false" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_11 */ ++ { "virtio-blk-pci", "vectors", "2" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_11 */ ++ { "vhost-user-blk-pci", "vectors", "2" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_11 ++ bz 1608778 modified for our 
naming */ ++ { "e1000-82540em", "migrate_tso_props", "off" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_10 */ ++ { "virtio-mouse-device", "wheel-axis", "false" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_10 */ ++ { "virtio-tablet-device", "wheel-axis", "false" }, ++ { "cirrus-vga", "vgamem_mb", "16" }, ++ { "migration", "decompress-error-check", "off" }, ++}; ++const size_t hw_compat_rhel_7_5_len = G_N_ELEMENTS(hw_compat_rhel_7_5); ++ ++/* Mostly like hw_compat_2_9 except ++ * x-mtu-bypass-backend, x-migrate-msix has already been ++ * backported to RHEL7.4. shpc was already on in 7.4. ++ */ ++GlobalProperty hw_compat_rhel_7_4[] = { ++ { "intel-iommu", "pt", "off" }, ++}; ++ ++const size_t hw_compat_rhel_7_4_len = G_N_ELEMENTS(hw_compat_rhel_7_4); ++/* Mostly like HW_COMPAT_2_6 + HW_COMPAT_2_7 + HW_COMPAT_2_8 except ++ * disable-modern, disable-legacy, page-per-vq have already been ++ * backported to RHEL7.3 ++ */ ++GlobalProperty hw_compat_rhel_7_3[] = { ++ { "virtio-mmio", "format_transport_address", "off" }, ++ { "virtio-serial-device", "emergency-write", "off" }, ++ { "ioapic", "version", "0x11" }, ++ { "intel-iommu", "x-buggy-eim", "true" }, ++ { "virtio-pci", "x-ignore-backend-features", "on" }, ++ { "fw_cfg_mem", "x-file-slots", stringify(0x10) }, ++ { "fw_cfg_io", "x-file-slots", stringify(0x10) }, ++ { "pflash_cfi01", "old-multiple-chip-handling", "on" }, ++ { TYPE_PCI_DEVICE, "x-pcie-extcap-init", "off" }, ++ { "virtio-pci", "x-pcie-deverr-init", "off" }, ++ { "virtio-pci", "x-pcie-lnkctl-init", "off" }, ++ { "virtio-pci", "x-pcie-pm-init", "off" }, ++ { "virtio-net-device", "x-mtu-bypass-backend", "off" }, ++ { "e1000e", "__redhat_e1000e_7_3_intr_state", "on" }, ++}; ++const size_t hw_compat_rhel_7_3_len = G_N_ELEMENTS(hw_compat_rhel_7_3); ++ ++/* Mostly like hw_compat_2_4 + 2_3 but: ++ * we don't need "any_layout" as it has been backported to 7.2 ++ */ ++GlobalProperty hw_compat_rhel_7_2[] = { ++ { "virtio-blk-device", "scsi", "true" }, ++ { 
"e1000-82540em", "extra_mac_registers", "off" }, ++ { "virtio-pci", "x-disable-pcie", "on" }, ++ { "virtio-pci", "migrate-extra", "off" }, ++ { "fw_cfg_mem", "dma_enabled", "off" }, ++ { "fw_cfg_io", "dma_enabled", "off" }, ++ { "isa-fdc", "fallback", "144" }, ++ /* Optional because not all virtio-pci devices support legacy mode */ ++ { "virtio-pci", "disable-modern", "on", .optional = true }, ++ { "virtio-pci", "disable-legacy", "off", .optional = true }, ++ { TYPE_PCI_DEVICE, "x-pcie-lnksta-dllla", "off" }, ++ { "virtio-pci", "page-per-vq", "on" }, ++ /* hw_compat_rhel_7_2 - introduced with 2.10.0 */ ++ { "migration", "send-section-footer", "off" }, ++ /* hw_compat_rhel_7_2 - introduced with 2.10.0 */ ++ { "migration", "store-global-state", "off", ++ }, ++}; ++const size_t hw_compat_rhel_7_2_len = G_N_ELEMENTS(hw_compat_rhel_7_2); ++ ++/* Mostly like hw_compat_2_1 but: ++ * we don't need virtio-scsi-pci since 7.0 already had that on ++ * ++ * RH: Note, qemu-extended-regs should have been enabled in the 7.1 ++ * machine type, but was accidentally turned off in 7.2 onwards. 
++ */ ++GlobalProperty hw_compat_rhel_7_1[] = { ++ { "intel-hda-generic", "old_msi_addr", "on" }, ++ { "VGA", "qemu-extended-regs", "off" }, ++ { "secondary-vga", "qemu-extended-regs", "off" }, ++ { "usb-mouse", "usb_version", stringify(1) }, ++ { "usb-kbd", "usb_version", stringify(1) }, ++ { "virtio-pci", "virtio-pci-bus-master-bug-migration", "on" }, ++ { "virtio-blk-pci", "any_layout", "off" }, ++ { "virtio-balloon-pci", "any_layout", "off" }, ++ { "virtio-serial-pci", "any_layout", "off" }, ++ { "virtio-9p-pci", "any_layout", "off" }, ++ { "virtio-rng-pci", "any_layout", "off" }, ++ /* HW_COMPAT_RHEL7_1 - introduced with 2.10.0 */ ++ { "migration", "send-configuration", "off" }, ++}; ++const size_t hw_compat_rhel_7_1_len = G_N_ELEMENTS(hw_compat_rhel_7_1); ++ + GlobalProperty hw_compat_4_1[] = { + { "virtio-pci", "x-pcie-flr-init", "off" }, + }; +diff --git a/hw/display/vga-isa.c b/hw/display/vga-isa.c +index 873e5e9706..d1a2efe47e 100644 +--- a/hw/display/vga-isa.c ++++ b/hw/display/vga-isa.c +@@ -82,7 +82,7 @@ static void vga_isa_realizefn(DeviceState *dev, Error **errp) + } + + static Property vga_isa_properties[] = { +- DEFINE_PROP_UINT32("vgamem_mb", ISAVGAState, state.vram_size_mb, 8), ++ DEFINE_PROP_UINT32("vgamem_mb", ISAVGAState, state.vram_size_mb, 16), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c +index b69fd7d8ad..d8be50a1ce 100644 +--- a/hw/net/e1000e.c ++++ b/hw/net/e1000e.c +@@ -79,6 +79,11 @@ typedef struct E1000EState { + + E1000ECore core; + ++ /* 7.3 had the intr_state field that was in the original e1000e code ++ * but that was removed prior to 2.7's release ++ */ ++ bool redhat_7_3_intr_state_enable; ++ uint32_t redhat_7_3_intr_state; + } E1000EState; + + #define E1000E_MMIO_IDX 0 +@@ -94,6 +99,10 @@ typedef struct E1000EState { + #define E1000E_MSIX_TABLE (0x0000) + #define E1000E_MSIX_PBA (0x2000) + ++/* Values as in RHEL 7.3 build and original upstream */ ++#define RH_E1000E_USE_MSI BIT(0) 
++#define RH_E1000E_USE_MSIX BIT(1) ++ + static uint64_t + e1000e_mmio_read(void *opaque, hwaddr addr, unsigned size) + { +@@ -305,6 +314,8 @@ e1000e_init_msix(E1000EState *s) + } else { + if (!e1000e_use_msix_vectors(s, E1000E_MSIX_VEC_NUM)) { + msix_uninit(d, &s->msix, &s->msix); ++ } else { ++ s->redhat_7_3_intr_state |= RH_E1000E_USE_MSIX; + } + } + } +@@ -476,6 +487,8 @@ static void e1000e_pci_realize(PCIDevice *pci_dev, Error **errp) + ret = msi_init(PCI_DEVICE(s), 0xD0, 1, true, false, NULL); + if (ret) { + trace_e1000e_msi_init_fail(ret); ++ } else { ++ s->redhat_7_3_intr_state |= RH_E1000E_USE_MSI; + } + + if (e1000e_add_pm_capability(pci_dev, e1000e_pmrb_offset, +@@ -599,6 +612,11 @@ static const VMStateDescription e1000e_vmstate_intr_timer = { + VMSTATE_STRUCT_ARRAY(_f, _s, _num, 0, \ + e1000e_vmstate_intr_timer, E1000IntrDelayTimer) + ++static bool rhel_7_3_check(void *opaque, int version_id) ++{ ++ return ((E1000EState *)opaque)->redhat_7_3_intr_state_enable; ++} ++ + static const VMStateDescription e1000e_vmstate = { + .name = "e1000e", + .version_id = 1, +@@ -610,6 +628,7 @@ static const VMStateDescription e1000e_vmstate = { + VMSTATE_MSIX(parent_obj, E1000EState), + + VMSTATE_UINT32(ioaddr, E1000EState), ++ VMSTATE_UINT32_TEST(redhat_7_3_intr_state, E1000EState, rhel_7_3_check), + VMSTATE_UINT32(core.rxbuf_min_shift, E1000EState), + VMSTATE_UINT8(core.rx_desc_len, E1000EState), + VMSTATE_UINT32_ARRAY(core.rxbuf_sizes, E1000EState, +@@ -658,6 +677,8 @@ static PropertyInfo e1000e_prop_disable_vnet, + + static Property e1000e_properties[] = { + DEFINE_NIC_PROPERTIES(E1000EState, conf), ++ DEFINE_PROP_BOOL("__redhat_e1000e_7_3_intr_state", E1000EState, ++ redhat_7_3_intr_state_enable, false), + DEFINE_PROP_SIGNED("disable_vnet_hdr", E1000EState, disable_vnet, false, + e1000e_prop_disable_vnet, bool), + DEFINE_PROP_SIGNED("subsys_ven", E1000EState, subsys_ven, +diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c +index 88a97d756d..21d80e96cf 100644 +--- 
a/hw/net/rtl8139.c ++++ b/hw/net/rtl8139.c +@@ -3177,7 +3177,7 @@ static int rtl8139_pre_save(void *opaque) + + static const VMStateDescription vmstate_rtl8139 = { + .name = "rtl8139", +- .version_id = 5, ++ .version_id = 4, + .minimum_version_id = 3, + .post_load = rtl8139_post_load, + .pre_save = rtl8139_pre_save, +@@ -3258,7 +3258,9 @@ static const VMStateDescription vmstate_rtl8139 = { + VMSTATE_UINT32(tally_counters.TxMCol, RTL8139State), + VMSTATE_UINT64(tally_counters.RxOkPhy, RTL8139State), + VMSTATE_UINT64(tally_counters.RxOkBrd, RTL8139State), ++#if 0 /* Disabled for Red Hat Enterprise Linux bz 1420195 */ + VMSTATE_UINT32_V(tally_counters.RxOkMul, RTL8139State, 5), ++#endif + VMSTATE_UINT16(tally_counters.TxAbt, RTL8139State), + VMSTATE_UINT16(tally_counters.TxUndrn, RTL8139State), + +diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c +index 74ae74bc5c..73820517df 100644 +--- a/hw/rtc/mc146818rtc.c ++++ b/hw/rtc/mc146818rtc.c +@@ -42,6 +42,7 @@ + #include "qapi/visitor.h" + #include "exec/address-spaces.h" + #include "hw/rtc/mc146818rtc_regs.h" ++#include "migration/migration.h" + + #ifdef TARGET_I386 + #include "qapi/qapi-commands-misc-target.h" +@@ -820,6 +821,11 @@ static int rtc_post_load(void *opaque, int version_id) + static bool rtc_irq_reinject_on_ack_count_needed(void *opaque) + { + RTCState *s = (RTCState *)opaque; ++ ++ if (migrate_pre_2_2) { ++ return false; ++ } ++ + return s->irq_reinject_on_ack_count != 0; + } + +diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c +index 11d476c4a2..e6e9355384 100644 +--- a/hw/smbios/smbios.c ++++ b/hw/smbios/smbios.c +@@ -777,6 +777,7 @@ void smbios_set_defaults(const char *manufacturer, const char *product, + SMBIOS_SET_DEFAULT(type1.manufacturer, manufacturer); + SMBIOS_SET_DEFAULT(type1.product, product); + SMBIOS_SET_DEFAULT(type1.version, version); ++ SMBIOS_SET_DEFAULT(type1.family, "Red Hat Enterprise Linux"); + SMBIOS_SET_DEFAULT(type2.manufacturer, manufacturer); + 
SMBIOS_SET_DEFAULT(type2.product, product); + SMBIOS_SET_DEFAULT(type2.version, version); +diff --git a/hw/timer/i8254_common.c b/hw/timer/i8254_common.c +index 050875b497..32935da46c 100644 +--- a/hw/timer/i8254_common.c ++++ b/hw/timer/i8254_common.c +@@ -231,7 +231,7 @@ static const VMStateDescription vmstate_pit_common = { + .pre_save = pit_dispatch_pre_save, + .post_load = pit_dispatch_post_load, + .fields = (VMStateField[]) { +- VMSTATE_UINT32_V(channels[0].irq_disabled, PITCommonState, 3), ++ VMSTATE_UINT32(channels[0].irq_disabled, PITCommonState), /* qemu-kvm's v2 had 'flags' here */ + VMSTATE_STRUCT_ARRAY(channels, PITCommonState, 3, 2, + vmstate_pit_channel, PITChannelState), + VMSTATE_INT64(channels[0].next_transition_time, +diff --git a/hw/usb/hcd-uhci.c b/hw/usb/hcd-uhci.c +index 23507ad3b5..9fd87a7ad9 100644 +--- a/hw/usb/hcd-uhci.c ++++ b/hw/usb/hcd-uhci.c +@@ -1219,12 +1219,14 @@ static void usb_uhci_common_realize(PCIDevice *dev, Error **errp) + UHCIState *s = UHCI(dev); + uint8_t *pci_conf = s->dev.config; + int i; ++ int irq_pin; + + pci_conf[PCI_CLASS_PROG] = 0x00; + /* TODO: reset value should be 0. 
*/ + pci_conf[USB_SBRN] = USB_RELEASE_1; // release number + +- pci_config_set_interrupt_pin(pci_conf, u->info.irq_pin + 1); ++ irq_pin = u->info.irq_pin; ++ pci_config_set_interrupt_pin(pci_conf, irq_pin + 1); + + if (s->masterbus) { + USBPort *ports[NB_PORTS]; +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index 80988bb305..8fed2eedd6 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -3590,9 +3590,27 @@ static const VMStateDescription vmstate_xhci_slot = { + } + }; + ++static int xhci_event_pre_save(void *opaque) ++{ ++ XHCIEvent *s = opaque; ++ ++ s->cve_2014_5263_a = ((uint8_t *)&s->type)[0]; ++ s->cve_2014_5263_b = ((uint8_t *)&s->type)[1]; ++ ++ return 0; ++} ++ ++bool migrate_cve_2014_5263_xhci_fields; ++ ++static bool xhci_event_cve_2014_5263(void *opaque, int version_id) ++{ ++ return migrate_cve_2014_5263_xhci_fields; ++} ++ + static const VMStateDescription vmstate_xhci_event = { + .name = "xhci-event", + .version_id = 1, ++ .pre_save = xhci_event_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT32(type, XHCIEvent), + VMSTATE_UINT32(ccode, XHCIEvent), +@@ -3601,6 +3619,8 @@ static const VMStateDescription vmstate_xhci_event = { + VMSTATE_UINT32(flags, XHCIEvent), + VMSTATE_UINT8(slotid, XHCIEvent), + VMSTATE_UINT8(epid, XHCIEvent), ++ VMSTATE_UINT8_TEST(cve_2014_5263_a, XHCIEvent, xhci_event_cve_2014_5263), ++ VMSTATE_UINT8_TEST(cve_2014_5263_b, XHCIEvent, xhci_event_cve_2014_5263), + VMSTATE_END_OF_LIST() + } + }; +diff --git a/hw/usb/hcd-xhci.h b/hw/usb/hcd-xhci.h +index 2fad4df2a7..f554b671e3 100644 +--- a/hw/usb/hcd-xhci.h ++++ b/hw/usb/hcd-xhci.h +@@ -157,6 +157,8 @@ typedef struct XHCIEvent { + uint32_t flags; + uint8_t slotid; + uint8_t epid; ++ uint8_t cve_2014_5263_a; ++ uint8_t cve_2014_5263_b; + } XHCIEvent; + + typedef struct XHCIInterrupter { +diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h +index 41568d1837..1a23ccc412 100644 +--- a/include/hw/acpi/ich9.h ++++ b/include/hw/acpi/ich9.h +@@ -61,6 +61,9 @@ 
typedef struct ICH9LPCPMRegs { + uint8_t smm_enabled; + bool enable_tco; + TCOIORegs tco_regs; ++ ++ /* RH addition, see bz 1489800 */ ++ bool force_rev1_fadt; + } ICH9LPCPMRegs; + + #define ACPI_PM_PROP_TCO_ENABLED "enable_tco" +diff --git a/include/hw/boards.h b/include/hw/boards.h +index de45087f34..6f85a0e032 100644 +--- a/include/hw/boards.h ++++ b/include/hw/boards.h +@@ -377,4 +377,28 @@ extern const size_t hw_compat_2_2_len; + extern GlobalProperty hw_compat_2_1[]; + extern const size_t hw_compat_2_1_len; + ++extern GlobalProperty hw_compat_rhel_8_1[]; ++extern const size_t hw_compat_rhel_8_1_len; ++ ++extern GlobalProperty hw_compat_rhel_8_0[]; ++extern const size_t hw_compat_rhel_8_0_len; ++ ++extern GlobalProperty hw_compat_rhel_7_6[]; ++extern const size_t hw_compat_rhel_7_6_len; ++ ++extern GlobalProperty hw_compat_rhel_7_5[]; ++extern const size_t hw_compat_rhel_7_5_len; ++ ++extern GlobalProperty hw_compat_rhel_7_4[]; ++extern const size_t hw_compat_rhel_7_4_len; ++ ++extern GlobalProperty hw_compat_rhel_7_3[]; ++extern const size_t hw_compat_rhel_7_3_len; ++ ++extern GlobalProperty hw_compat_rhel_7_2[]; ++extern const size_t hw_compat_rhel_7_2_len; ++ ++extern GlobalProperty hw_compat_rhel_7_1[]; ++extern const size_t hw_compat_rhel_7_1_len; ++ + #endif +diff --git a/include/hw/usb.h b/include/hw/usb.h +index c24d968a19..b353438ea0 100644 +--- a/include/hw/usb.h ++++ b/include/hw/usb.h +@@ -605,4 +605,8 @@ int usb_get_quirks(uint16_t vendor_id, uint16_t product_id, + uint8_t interface_class, uint8_t interface_subclass, + uint8_t interface_protocol); + ++ ++/* hcd-xhci.c -- rhel7.0.0 machine type compatibility */ ++extern bool migrate_cve_2014_5263_xhci_fields; ++ + #endif +diff --git a/migration/migration.c b/migration/migration.c +index 354ad072fa..30c53c623b 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -121,6 +121,8 @@ enum mig_rp_message_type { + MIG_RP_MSG_MAX + }; + ++bool migrate_pre_2_2; ++ + /* When we add fault 
tolerance, we could have several + migrations at once. For now we don't need to add + dynamic creation of migration */ +diff --git a/migration/migration.h b/migration/migration.h +index 79b3dda146..0b1b0d4df5 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -335,6 +335,11 @@ void init_dirty_bitmap_incoming_migration(void); + void migrate_add_address(SocketAddress *address); + + int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque); ++/* ++ * Disables a load of subsections that were added in 2.2/rh7.2 for backwards ++ * migration compatibility. ++ */ ++extern bool migrate_pre_2_2; + + #define qemu_ram_foreach_block \ + #warning "Use foreach_not_ignored_block in migration code" +-- +2.21.0 + diff --git a/0008-Add-aarch64-machine-types.patch b/0008-Add-aarch64-machine-types.patch new file mode 100755 index 0000000..5397c8b --- /dev/null +++ b/0008-Add-aarch64-machine-types.patch @@ -0,0 +1,276 @@ +From 49164264d9928f73961acbbe4d56d8dfa23d8099 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 12:53:31 +0200 +Subject: Add aarch64 machine types + +Adding changes to add RHEL machine types for aarch64 architecture. 
+ +Signed-off-by: Miroslav Rezanina + +Rebase changes (4.0.0): +- Use upstream compat handling + +Rebase changes (4.1.0-rc0): +- Removed a15memmap (upstream) +- Use virt_flash_create in rhel800_virt_instance_init + +Rebase changes (4.2.0-rc0): +- Set numa_mem_supported + +Rebase notes (4.2.0-rc3): +- aarch64: Add virt-rhel8.2.0 machine type for ARM (patch 92246) +- aarch64: virt: Allow more than 1TB of RAM (patch 92249) +- aarch64: virt: Allow PCDIMM instantiation (patch 92247) +- aarch64: virt: Enhance the comment related to gic-version (patch 92248) + +Merged patches (4.0.0): +- 7bfdb4c aarch64: Add virt-rhel8.0.0 machine type for ARM +- 3433e69 aarch64: Set virt-rhel8.0.0 max_cpus to 512 +- 4d20863 aarch64: Use 256MB ECAM region by default + +Merged patches (4.1.0): +- c3e39ef aarch64: Add virt-rhel8.1.0 machine type for ARM +- 59a46d1 aarch64: Allow ARM VIRT iommu option in RHEL8.1 machine + +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/virt.c | 161 +++++++++++++++++++++++++++++++++++++++++- + include/hw/arm/virt.h | 11 +++ + 2 files changed, 171 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index d4bedc2607..e10839100e 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -72,6 +72,7 @@ + #include "hw/mem/nvdimm.h" + #include "hw/acpi/generic_event_device.h" + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ + static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ + void *data) \ +@@ -98,7 +99,49 @@ + DEFINE_VIRT_MACHINE_LATEST(major, minor, true) + #define DEFINE_VIRT_MACHINE(major, minor) \ + DEFINE_VIRT_MACHINE_LATEST(major, minor, false) +- ++#endif /* disabled for RHEL */ ++ ++#define DEFINE_RHEL_MACHINE_LATEST(m, n, s, latest) \ ++ static void rhel##m##n##s##_virt_class_init(ObjectClass *oc, \ ++ void *data) \ ++ { \ ++ MachineClass *mc = MACHINE_CLASS(oc); \ ++ rhel##m##n##s##_virt_options(mc); \ ++ mc->desc = "RHEL " # m "." # n "." 
# s " ARM Virtual Machine"; \ ++ if (latest) { \ ++ mc->alias = "virt"; \ ++ mc->is_default = 1; \ ++ } \ ++ } \ ++ static const TypeInfo rhel##m##n##s##_machvirt_info = { \ ++ .name = MACHINE_TYPE_NAME("virt-rhel" # m "." # n "." # s), \ ++ .parent = TYPE_RHEL_MACHINE, \ ++ .instance_init = rhel##m##n##s##_virt_instance_init, \ ++ .class_init = rhel##m##n##s##_virt_class_init, \ ++ }; \ ++ static void rhel##m##n##s##_machvirt_init(void) \ ++ { \ ++ type_register_static(&rhel##m##n##s##_machvirt_info); \ ++ } \ ++ type_init(rhel##m##n##s##_machvirt_init); ++ ++#define DEFINE_RHEL_MACHINE_AS_LATEST(major, minor, subminor) \ ++ DEFINE_RHEL_MACHINE_LATEST(major, minor, subminor, true) ++#define DEFINE_RHEL_MACHINE(major, minor, subminor) \ ++ DEFINE_RHEL_MACHINE_LATEST(major, minor, subminor, false) ++ ++/* This variable is for changes to properties that are RHEL specific, ++ * different to the current upstream and to be applied to the latest ++ * machine type. ++ */ ++GlobalProperty arm_rhel_compat[] = { ++ { ++ .driver = "virtio-net-pci", ++ .property = "romfile", ++ .value = "", ++ }, ++}; ++const size_t arm_rhel_compat_len = G_N_ELEMENTS(arm_rhel_compat); + + /* Number of external interrupt lines to configure the GIC with */ + #define NUM_IRQS 256 +@@ -1763,6 +1806,7 @@ static void machvirt_init(MachineState *machine) + qemu_add_machine_init_done_notifier(&vms->machine_done); + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static bool virt_get_secure(Object *obj, Error **errp) + { + VirtMachineState *vms = VIRT_MACHINE(obj); +@@ -1791,6 +1835,7 @@ static void virt_set_virt(Object *obj, bool value, Error **errp) + vms->virt = value; + } + ++#endif /* disabled for RHEL */ + static bool virt_get_highmem(Object *obj, Error **errp) + { + VirtMachineState *vms = VIRT_MACHINE(obj); +@@ -2022,6 +2067,7 @@ static int virt_kvm_type(MachineState *ms, const char *type_str) + return requested_pa_size > 40 ? 
requested_pa_size : 0; + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void virt_machine_class_init(ObjectClass *oc, void *data) + { + MachineClass *mc = MACHINE_CLASS(oc); +@@ -2258,3 +2304,116 @@ static void virt_machine_2_6_options(MachineClass *mc) + vmc->no_pmu = true; + } + DEFINE_VIRT_MACHINE(2, 6) ++#endif /* disabled for RHEL */ ++ ++static void rhel_machine_class_init(ObjectClass *oc, void *data) ++{ ++ MachineClass *mc = MACHINE_CLASS(oc); ++ HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); ++ ++ mc->family = "virt-rhel-Z"; ++ mc->init = machvirt_init; ++ /* Start with max_cpus set to 512, which is the maximum supported by KVM. ++ * The value may be reduced later when we have more information about the ++ * configuration of the particular instance. ++ */ ++ mc->max_cpus = 512; ++ mc->block_default_type = IF_VIRTIO; ++ mc->no_cdrom = 1; ++ mc->pci_allow_0_address = true; ++ /* We know we will never create a pre-ARMv7 CPU which needs 1K pages */ ++ mc->minimum_page_bits = 12; ++ mc->possible_cpu_arch_ids = virt_possible_cpu_arch_ids; ++ mc->cpu_index_to_instance_props = virt_cpu_index_to_props; ++ mc->default_cpu_type = ARM_CPU_TYPE_NAME("cortex-a57"); ++ mc->get_default_cpu_node_id = virt_get_default_cpu_node_id; ++ mc->kvm_type = virt_kvm_type; ++ assert(!mc->get_hotplug_handler); ++ mc->get_hotplug_handler = virt_machine_get_hotplug_handler; ++ hc->pre_plug = virt_machine_device_pre_plug_cb; ++ hc->plug = virt_machine_device_plug_cb; ++ hc->unplug_request = virt_machine_device_unplug_request_cb; ++ mc->numa_mem_supported = true; ++ mc->auto_enable_numa_with_memhp = true; ++} ++ ++static const TypeInfo rhel_machine_info = { ++ .name = TYPE_RHEL_MACHINE, ++ .parent = TYPE_MACHINE, ++ .abstract = true, ++ .instance_size = sizeof(VirtMachineState), ++ .class_size = sizeof(VirtMachineClass), ++ .class_init = rhel_machine_class_init, ++ .interfaces = (InterfaceInfo[]) { ++ { TYPE_HOTPLUG_HANDLER }, ++ { } ++ }, ++}; ++ ++static void 
rhel_machine_init(void) ++{ ++ type_register_static(&rhel_machine_info); ++} ++type_init(rhel_machine_init); ++ ++static void rhel820_virt_instance_init(Object *obj) ++{ ++ VirtMachineState *vms = VIRT_MACHINE(obj); ++ VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); ++ ++ /* EL3 is disabled by default and non-configurable for RHEL */ ++ vms->secure = false; ++ /* EL2 is disabled by default and non-configurable for RHEL */ ++ vms->virt = false; ++ /* High memory is enabled by default for RHEL */ ++ vms->highmem = true; ++ object_property_add_bool(obj, "highmem", virt_get_highmem, ++ virt_set_highmem, NULL); ++ object_property_set_description(obj, "highmem", ++ "Set on/off to enable/disable using " ++ "physical address space above 32 bits", ++ NULL); ++ /* ++ * Default GIC type is still v2, but became configurable for RHEL. We ++ * keep v2 instead of max as TCG CI test cases require an MSI controller ++ * and there is no userspace ITS MSI emulation available. ++ */ ++ vms->gic_version = 2; ++ object_property_add_str(obj, "gic-version", virt_get_gic_version, ++ virt_set_gic_version, NULL); ++ object_property_set_description(obj, "gic-version", ++ "Set GIC version. " ++ "Valid values are 2, 3 and host", NULL); ++ ++ vms->highmem_ecam = !vmc->no_highmem_ecam; ++ ++ if (vmc->no_its) { ++ vms->its = false; ++ } else { ++ /* Default allows ITS instantiation */ ++ vms->its = true; ++ object_property_add_bool(obj, "its", virt_get_its, ++ virt_set_its, NULL); ++ object_property_set_description(obj, "its", ++ "Set on/off to enable/disable " ++ "ITS instantiation", ++ NULL); ++ } ++ ++ /* Default disallows iommu instantiation */ ++ vms->iommu = VIRT_IOMMU_NONE; ++ object_property_add_str(obj, "iommu", virt_get_iommu, virt_set_iommu, NULL); ++ object_property_set_description(obj, "iommu", ++ "Set the IOMMU type. 
" ++ "Valid values are none and smmuv3", ++ NULL); ++ ++ vms->irqmap=a15irqmap; ++ virt_flash_create(vms); ++} ++ ++static void rhel820_virt_options(MachineClass *mc) ++{ ++ compat_props_add(mc->compat_props, arm_rhel_compat, arm_rhel_compat_len); ++} ++DEFINE_RHEL_MACHINE_AS_LATEST(8, 2, 0) +diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h +index 0b41083e9d..53fdf16563 100644 +--- a/include/hw/arm/virt.h ++++ b/include/hw/arm/virt.h +@@ -142,6 +142,7 @@ typedef struct { + + #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) + ++#if 0 /* disabled for Red Hat Enterprise Linux */ + #define TYPE_VIRT_MACHINE MACHINE_TYPE_NAME("virt") + #define VIRT_MACHINE(obj) \ + OBJECT_CHECK(VirtMachineState, (obj), TYPE_VIRT_MACHINE) +@@ -150,6 +151,16 @@ typedef struct { + #define VIRT_MACHINE_CLASS(klass) \ + OBJECT_CLASS_CHECK(VirtMachineClass, klass, TYPE_VIRT_MACHINE) + ++#else ++#define TYPE_RHEL_MACHINE MACHINE_TYPE_NAME("virt-rhel") ++#define VIRT_MACHINE(obj) \ ++ OBJECT_CHECK(VirtMachineState, (obj), TYPE_RHEL_MACHINE) ++#define VIRT_MACHINE_GET_CLASS(obj) \ ++ OBJECT_GET_CLASS(VirtMachineClass, obj, TYPE_RHEL_MACHINE) ++#define VIRT_MACHINE_CLASS(klass) \ ++ OBJECT_CLASS_CHECK(VirtMachineClass, klass, TYPE_RHEL_MACHINE) ++#endif ++ + void virt_acpi_setup(VirtMachineState *vms); + + /* Return the number of used redistributor regions */ +-- +2.21.0 + diff --git a/0009-Add-ppc64-machine-types.patch b/0009-Add-ppc64-machine-types.patch new file mode 100755 index 0000000..a3f1a54 --- /dev/null +++ b/0009-Add-ppc64-machine-types.patch @@ -0,0 +1,463 @@ +From 136eae41007e2e5b0d693cc656f3ec36cbabf16f Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 13:27:13 +0200 +Subject: Add ppc64 machine types + +Adding changes to add RHEL machine types for ppc64 architecture. 
+ +Signed-off-by: Miroslav Rezanina + +Rebase changes (4.0.0): +- remove instance options and use upstream solution +- Use upstream compat handling +- Replace SPAPR_PCI_2_7_MMIO_WIN_SIZE with value (changed upstream) +- re-add handling of instance_options (removed upstream) +- Use p8 as default for rhel machine types (p9 default upstream) +- sPAPRMachineClass renamed to SpaprMachineClass (upstream) + +Rebase changes (4.1.0): +- Update format for compat structures + +Merged patches (4.0.0): +- 467d59a redhat: define pseries-rhel8.0.0 machine type + +Merged patches (4.1.0): +- f21757edc target/ppc/spapr: Enable mitigations by default for pseries-4.0 machine type +- 2511c63 redhat: sync pseries-rhel7.6.0 with rhel-av-8.0.1 +- 89f01da redhat: define pseries-rhel8.1.0 machine type + +Merged patches (4.2.0): +- bcba728 redhat: update pseries-rhel8.1.0 machine type +- redhat: update pseries-rhel-7.6.0 machine type (patch 93039) +- redhat: define pseries-rhel8.2.0 machine type (patch 93041) + +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/ppc/spapr.c | 278 ++++++++++++++++++++++++++++++++++++++++ + hw/ppc/spapr_cpu_core.c | 13 ++ + include/hw/ppc/spapr.h | 1 + + target/ppc/compat.c | 13 +- + target/ppc/cpu.h | 1 + + 5 files changed, 305 insertions(+), 1 deletion(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index e076f6023c..8749c72066 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -4447,6 +4447,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) + smc->linux_pci_probe = true; + smc->smp_threads_vsmt = true; + smc->nr_xirqs = SPAPR_NR_XIRQS; ++ smc->has_power9_support = true; + } + + static const TypeInfo spapr_machine_info = { +@@ -4491,6 +4492,7 @@ static const TypeInfo spapr_machine_info = { + } \ + type_init(spapr_machine_register_##suffix) + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + /* + * pseries-4.2 + */ +@@ -4520,6 +4522,7 @@ static void spapr_machine_4_1_class_options(MachineClass *mc) + } + + DEFINE_SPAPR_MACHINE(4_1, "4.1", false); ++#endif + + /* + * pseries-4.0 +@@ -4536,6 +4539,7 @@ static void phb_placement_4_0(SpaprMachineState *spapr, uint32_t index, + *nv2atsd = 0; + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void spapr_machine_4_0_class_options(MachineClass *mc) + { + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); +@@ -4695,6 +4699,7 @@ DEFINE_SPAPR_MACHINE(2_8, "2.8", false); + /* + * pseries-2.7 + */ ++#endif + + static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index, + uint64_t *buid, hwaddr *pio, +@@ -4749,6 +4754,7 @@ static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index, + *nv2atsd = 0; + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void spapr_machine_2_7_class_options(MachineClass *mc) + { + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); +@@ -4863,6 +4869,278 @@ static void spapr_machine_2_1_class_options(MachineClass *mc) + compat_props_add(mc->compat_props, hw_compat_2_1, hw_compat_2_1_len); + } + DEFINE_SPAPR_MACHINE(2_1, "2.1", 
false); ++#endif ++ ++/* ++ * pseries-rhel8.2.0 ++ */ ++ ++static void spapr_machine_rhel820_class_options(MachineClass *mc) ++{ ++ /* Defaults for the latest behaviour inherited from the base class */ ++} ++ ++DEFINE_SPAPR_MACHINE(rhel820, "rhel8.2.0", true); ++ ++/* ++ * pseries-rhel8.1.0 ++ * like pseries-4.1 ++ */ ++ ++static void spapr_machine_rhel810_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ static GlobalProperty compat[] = { ++ /* Only allow 4kiB and 64kiB IOMMU pagesizes */ ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" }, ++ }; ++ ++ spapr_machine_rhel820_class_options(mc); ++ ++ /* from pseries-4.1 */ ++ smc->linux_pci_probe = false; ++ smc->smp_threads_vsmt = false; ++ compat_props_add(mc->compat_props, hw_compat_rhel_8_1, ++ hw_compat_rhel_8_1_len); ++ compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); ++ ++} ++ ++DEFINE_SPAPR_MACHINE(rhel810, "rhel8.1.0", false); ++ ++/* ++ * pseries-rhel8.0.0 ++ * like pseries-3.1 and pseries-4.0 ++ * except SPAPR_CAP_CFPC, SPAPR_CAP_SBBC and SPAPR_CAP_IBS ++ * that have been backported to pseries-rhel8.0.0 ++ */ ++ ++static void spapr_machine_rhel800_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel810_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_8_0, ++ hw_compat_rhel_8_0_len); ++ ++ /* pseries-4.0 */ ++ smc->phb_placement = phb_placement_4_0; ++ smc->irq = &spapr_irq_xics; ++ smc->pre_4_1_migration = true; ++ ++ /* pseries-3.1 */ ++ mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0"); ++ smc->update_dt_enabled = false; ++ smc->dr_phb_enabled = false; ++ smc->broken_host_serial_model = true; ++ smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel800, "rhel8.0.0", false); ++ ++/* ++ * pseries-rhel7.6.0 ++ * like spapr_compat_2_12 and spapr_compat_3_0 ++ * spapr_compat_0 is empty ++ */ ++GlobalProperty 
spapr_compat_rhel7_6[] = { ++ { TYPE_POWERPC_CPU, "pre-3.0-migration", "on" }, ++ { TYPE_SPAPR_CPU_CORE, "pre-3.0-migration", "on" }, ++}; ++const size_t spapr_compat_rhel7_6_len = G_N_ELEMENTS(spapr_compat_rhel7_6); ++ ++ ++static void spapr_machine_rhel760_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel800_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_6, hw_compat_rhel_7_6_len); ++ compat_props_add(mc->compat_props, spapr_compat_rhel7_6, spapr_compat_rhel7_6_len); ++ ++ /* from spapr_machine_3_0_class_options() */ ++ smc->legacy_irq_allocation = true; ++ smc->nr_xirqs = 0x400; ++ smc->irq = &spapr_irq_xics_legacy; ++ ++ /* from spapr_machine_2_12_class_options() */ ++ /* We depend on kvm_enabled() to choose a default value for the ++ * hpt-max-page-size capability. Of course we can't do it here ++ * because this is too early and the HW accelerator isn't initialized ++ * yet. Postpone this to machine init (see default_caps_with_cpu()).
++ */ ++ smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 0; ++ ++ /* SPAPR_CAP_WORKAROUND enabled in pseries-rhel800 by ++ * f21757edc554 ++ * ("Enable mitigations by default for pseries-4.0 machine type") ++ */ ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel760, "rhel7.6.0", false); ++ ++/* ++ * pseries-rhel7.6.0-sxxm ++ * ++ * pseries-rhel7.6.0 with speculative execution exploit mitigations enabled by default ++ */ ++ ++static void spapr_machine_rhel760sxxm_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel760_class_options(mc); ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel760sxxm, "rhel7.6.0-sxxm", false); ++ ++static void spapr_machine_rhel750_class_options(MachineClass *mc) ++{ ++ spapr_machine_rhel760_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_5, hw_compat_rhel_7_5_len); ++ ++} ++ ++DEFINE_SPAPR_MACHINE(rhel750, "rhel7.5.0", false); ++ ++/* ++ * pseries-rhel7.5.0-sxxm ++ * ++ * pseries-rhel7.5.0 with speculative execution exploit mitigations enabled by default ++ */ ++ ++static void spapr_machine_rhel750sxxm_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel750_class_options(mc); ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel750sxxm, "rhel7.5.0-sxxm", false); ++ ++/* ++ * pseries-rhel7.4.0 ++ * like spapr_compat_2_9 ++ */ ++GlobalProperty spapr_compat_rhel7_4[] = { ++ {
TYPE_POWERPC_CPU, "pre-2.10-migration", "on" }, ++}; ++const size_t spapr_compat_rhel7_4_len = G_N_ELEMENTS(spapr_compat_rhel7_4); ++ ++static void spapr_machine_rhel740_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel750_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_4, hw_compat_rhel_7_4_len); ++ compat_props_add(mc->compat_props, spapr_compat_rhel7_4, spapr_compat_rhel7_4_len); ++ mc->numa_auto_assign_ram = numa_legacy_auto_assign_ram; ++ smc->has_power9_support = false; ++ smc->pre_2_10_has_unused_icps = true; ++ smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED; ++ smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_ON; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel740, "rhel7.4.0", false); ++ ++/* ++ * pseries-rhel7.4.0-sxxm ++ * ++ * pseries-rhel7.4.0 with speculative execution exploit mitigations enabled by default ++ */ ++ ++static void spapr_machine_rhel740sxxm_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel740_class_options(mc); ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel740sxxm, "rhel7.4.0-sxxm", false); ++ ++/* ++ * pseries-rhel7.3.0 ++ * like spapr_compat_2_6/_2_7/_2_8 but "ddw" has been backported to RHEL7_3 ++ */ ++GlobalProperty spapr_compat_rhel7_3[] = { ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0xf80000000" }, ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem64_win_size", "0" }, ++ { TYPE_POWERPC_CPU, "pre-2.8-migration", "on" }, ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-2.8-migration", "on" }, ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "pcie-extended-configuration-space", "off" }, ++}; ++const size_t spapr_compat_rhel7_3_len = G_N_ELEMENTS(spapr_compat_rhel7_3); ++ ++static void spapr_machine_rhel730_class_options(MachineClass *mc) ++{ 
++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel740_class_options(mc); ++ mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power7_v2.3"); ++ mc->default_machine_opts = "modern-hotplug-events=off"; ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_3, hw_compat_rhel_7_3_len); ++ compat_props_add(mc->compat_props, spapr_compat_rhel7_3, spapr_compat_rhel7_3_len); ++ ++ smc->phb_placement = phb_placement_2_7; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel730, "rhel7.3.0", false); ++ ++/* ++ * pseries-rhel7.3.0-sxxm ++ * ++ * pseries-rhel7.3.0 with speculative execution exploit mitigations enabled by default ++ */ ++ ++static void spapr_machine_rhel730sxxm_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel730_class_options(mc); ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel730sxxm, "rhel7.3.0-sxxm", false); ++ ++/* ++ * pseries-rhel7.2.0 ++ */ ++/* Should be like spapr_compat_2_5 + 2_4 + 2_3, but "dynamic-reconfiguration" ++ * has been backported to RHEL7_2 so we don't need it here. 
++ */ ++ ++GlobalProperty spapr_compat_rhel7_2[] = { ++ { "spapr-vlan", "use-rx-buffer-pools", "off" }, ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "ddw", "off" }, ++}; ++const size_t spapr_compat_rhel7_2_len = G_N_ELEMENTS(spapr_compat_rhel7_2); ++ ++static void spapr_machine_rhel720_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel730_class_options(mc); ++ smc->use_ohci_by_default = true; ++ mc->has_hotpluggable_cpus = NULL; ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_2, hw_compat_rhel_7_2_len); ++ compat_props_add(mc->compat_props, spapr_compat_rhel7_2, spapr_compat_rhel7_2_len); ++} ++ ++DEFINE_SPAPR_MACHINE(rhel720, "rhel7.2.0", false); + + static void spapr_machine_register_types(void) + { +diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c +index 301cd7b4e4..ba5a8fb82b 100644 +--- a/hw/ppc/spapr_cpu_core.c ++++ b/hw/ppc/spapr_cpu_core.c +@@ -24,6 +24,7 @@ + #include "sysemu/reset.h" + #include "sysemu/hw_accel.h" + #include "qemu/error-report.h" ++#include "cpu-models.h" + + static void spapr_reset_vcpu(PowerPCCPU *cpu) + { +@@ -242,6 +243,7 @@ static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, + CPUPPCState *env = &cpu->env; + CPUState *cs = CPU(cpu); + Error *local_err = NULL; ++ SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); + + object_property_set_bool(OBJECT(cpu), true, "realized", &local_err); + if (local_err) { +@@ -254,6 +256,17 @@ static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, + cpu_ppc_set_vhyp(cpu, PPC_VIRTUAL_HYPERVISOR(spapr)); + kvmppc_set_papr(cpu); + ++ if (!smc->has_power9_support && ++ (((spapr->max_compat_pvr && ++ ppc_compat_cmp(spapr->max_compat_pvr, ++ CPU_POWERPC_LOGICAL_3_00) >= 0)) || ++ (!spapr->max_compat_pvr && ++ ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, 0)))) { ++ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, ++ "POWER9 CPU is not supported by this machine class"); ++ return; ++ } ++ + 
if (spapr_irq_cpu_intc_create(spapr, cpu, &local_err) < 0) { + goto error_intc_create; + } +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h +index d5ab5ea7b2..aa89cc4a95 100644 +--- a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -125,6 +125,7 @@ struct SpaprMachineClass { + bool linux_pci_probe; + bool smp_threads_vsmt; /* set VSMT to smp_threads by default */ + ++ bool has_power9_support; + void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, + uint64_t *buid, hwaddr *pio, + hwaddr *mmio32, hwaddr *mmio64, +diff --git a/target/ppc/compat.c b/target/ppc/compat.c +index 7de4bf3122..3e2e35342d 100644 +--- a/target/ppc/compat.c ++++ b/target/ppc/compat.c +@@ -105,8 +105,19 @@ static const CompatInfo *compat_by_pvr(uint32_t pvr) + return NULL; + } + ++long ppc_compat_cmp(uint32_t pvr1, uint32_t pvr2) ++{ ++ const CompatInfo *compat1 = compat_by_pvr(pvr1); ++ const CompatInfo *compat2 = compat_by_pvr(pvr2); ++ ++ g_assert(compat1); ++ g_assert(compat2); ++ ++ return compat1 - compat2; ++} ++ + static bool pcc_compat(PowerPCCPUClass *pcc, uint32_t compat_pvr, +- uint32_t min_compat_pvr, uint32_t max_compat_pvr) ++ uint32_t min_compat_pvr, uint32_t max_compat_pvr) + { + const CompatInfo *compat = compat_by_pvr(compat_pvr); + const CompatInfo *min = compat_by_pvr(min_compat_pvr); +diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h +index e3e82327b7..5c53801cfd 100644 +--- a/target/ppc/cpu.h ++++ b/target/ppc/cpu.h +@@ -1367,6 +1367,7 @@ static inline int cpu_mmu_index(CPUPPCState *env, bool ifetch) + + /* Compatibility modes */ + #if defined(TARGET_PPC64) ++long ppc_compat_cmp(uint32_t pvr1, uint32_t pvr2); + bool ppc_check_compat(PowerPCCPU *cpu, uint32_t compat_pvr, + uint32_t min_compat_pvr, uint32_t max_compat_pvr); + bool ppc_type_check_compat(const char *cputype, uint32_t compat_pvr, +-- +2.21.0 + diff --git a/0010-Add-s390x-machine-types.patch b/0010-Add-s390x-machine-types.patch new file mode 100755 index 0000000..d0f6669 --- 
/dev/null +++ b/0010-Add-s390x-machine-types.patch @@ -0,0 +1,126 @@ +From 0842700b3a01891c316e9169fa651f26714cafa5 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 13:47:32 +0200 +Subject: Add s390x machine types + +Adding changes to add RHEL machine types for s390x architecture. + +Signed-off-by: Miroslav Rezanina + +Rebase changes (weekly-4.1.0): +- Use upstream compat handling + +Merged patches (3.1.0): +- 29df663 s390x/cpumodel: default enable bpb and ppa15 for z196 and later + +Merged patches (4.1.0): +- 6c200d665b hw/s390x/s390-virtio-ccw: Add machine types for RHEL8.0.0 + +Merged patches (4.2.0): +- fb192e5 redhat: s390x: Rename s390-ccw-virtio-rhel8.0.0 to s390-ccw-virtio-rhel8.1.0 +- a9b22e8 redhat: s390x: Add proper compatibility options for the -rhel7.6.0 machine +- hw/s390x: Add the s390-ccw-virtio-rhel8.2.0 machine types (patch 92954) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 70 +++++++++++++++++++++++++++++++++++++- + 1 file changed, 69 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index d3edeef0ad..c2c83d2fce 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -615,7 +615,7 @@ bool css_migration_enabled(void) + { \ + MachineClass *mc = MACHINE_CLASS(oc); \ + ccw_machine_##suffix##_class_options(mc); \ +- mc->desc = "VirtIO-ccw based S390 machine v" verstr; \ ++ mc->desc = "VirtIO-ccw based S390 machine " verstr; \ + if (latest) { \ + mc->alias = "s390-ccw-virtio"; \ + mc->is_default = 1; \ +@@ -639,6 +639,7 @@ bool css_migration_enabled(void) + } \ + type_init(ccw_machine_register_##suffix) + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void ccw_machine_4_2_instance_options(MachineState *machine) + { + } +@@ -866,6 +867,73 @@ static void ccw_machine_2_4_class_options(MachineClass *mc) + compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); + } + DEFINE_CCW_MACHINE(2_4, "2.4", 
false); ++#endif ++ ++static void ccw_machine_rhel820_instance_options(MachineState *machine) ++{ ++} ++ ++static void ccw_machine_rhel820_class_options(MachineClass *mc) ++{ ++} ++DEFINE_CCW_MACHINE(rhel820, "rhel8.2.0", true); ++ ++static void ccw_machine_rhel760_instance_options(MachineState *machine) ++{ ++ static const S390FeatInit qemu_cpu_feat = { S390_FEAT_LIST_QEMU_V3_1 }; ++ ++ ccw_machine_rhel820_instance_options(machine); ++ ++ s390_set_qemu_cpu_model(0x2827, 12, 2, qemu_cpu_feat); ++ ++ /* The multiple-epoch facility was not available with rhel7.6.0 on z14GA1 */ ++ s390_cpudef_featoff(14, 1, S390_FEAT_MULTIPLE_EPOCH); ++ s390_cpudef_featoff(14, 1, S390_FEAT_PTFF_QSIE); ++ s390_cpudef_featoff(14, 1, S390_FEAT_PTFF_QTOUE); ++ s390_cpudef_featoff(14, 1, S390_FEAT_PTFF_STOE); ++ s390_cpudef_featoff(14, 1, S390_FEAT_PTFF_STOUE); ++} ++ ++static void ccw_machine_rhel760_class_options(MachineClass *mc) ++{ ++ ccw_machine_rhel820_class_options(mc); ++ /* We never published the s390x version of RHEL-AV 8.0 and 8.1, so add this here */ ++ compat_props_add(mc->compat_props, hw_compat_rhel_8_0, hw_compat_rhel_8_0_len); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_6, hw_compat_rhel_7_6_len); ++} ++DEFINE_CCW_MACHINE(rhel760, "rhel7.6.0", false); ++ ++static void ccw_machine_rhel750_instance_options(MachineState *machine) ++{ ++ static const S390FeatInit qemu_cpu_feat = { S390_FEAT_LIST_QEMU_V2_11 }; ++ ccw_machine_rhel760_instance_options(machine); ++ ++ /* before 2.12 we emulated the very first z900, and RHEL 7.5 is ++ based on 2.10 */ ++ s390_set_qemu_cpu_model(0x2064, 7, 1, qemu_cpu_feat); ++ ++ /* bpb and ppa15 were only in the full model in RHEL 7.5 */ ++ s390_cpudef_featoff_greater(11, 1, S390_FEAT_PPA15); ++ s390_cpudef_featoff_greater(11, 1, S390_FEAT_BPB); ++} ++ ++GlobalProperty ccw_compat_rhel_7_5[] = { ++ { ++ .driver = TYPE_SCLP_EVENT_FACILITY, ++ .property = "allow_all_mask_sizes", ++ .value = "off", ++ }, ++}; ++const size_t 
ccw_compat_rhel_7_5_len = G_N_ELEMENTS(ccw_compat_rhel_7_5); ++ ++static void ccw_machine_rhel750_class_options(MachineClass *mc) ++{ ++ ccw_machine_rhel760_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_5, hw_compat_rhel_7_5_len); ++ compat_props_add(mc->compat_props, ccw_compat_rhel_7_5, ccw_compat_rhel_7_5_len); ++ S390_MACHINE_CLASS(mc)->hpage_1m_allowed = false; ++} ++DEFINE_CCW_MACHINE(rhel750, "rhel7.5.0", false); + + static void ccw_machine_register_types(void) + { +-- +2.21.0 + diff --git a/0011-Add-x86_64-machine-types.patch b/0011-Add-x86_64-machine-types.patch new file mode 100755 index 0000000..72a5159 --- /dev/null +++ b/0011-Add-x86_64-machine-types.patch @@ -0,0 +1,897 @@ +From 2ebaeca6e26950f401a8169d1324be2bafd11741 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 13:10:31 +0200 +Subject: Add x86_64 machine types + +Adding changes to add RHEL machine types for x86_64 architecture. + +Signed-off-by: Miroslav Rezanina + +Rebase changes (qemu-4.0.0): +- Use upstream compat handling + +Rebase notes (3.1.0): +- Removed xsave changes + +Rebase notes (4.1.0): +- Updated format for compat structures + +Rebase notes (4.2.0-rc2): +- Use X86MachineClass for save_tsc_khz (upstream change) + +Merged patches (4.1.0): +- f4dc802 pc: 7.5 compat entries +- 456ed3e pc: PC_RHEL7_6_COMPAT +- 04119ee pc: Add compat for pc-i440fx-rhel7.6.0 machine type +- b3b3687 pc: Add pc-q35-8.0.0 machine type +- 8d46fc6 pc: Add x-migrate-smi-count=off to PC_RHEL7_6_COMPAT +- 1de7949 kvm: clear out KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT for older machine types +- 18cf0d7 target/i386: Disable MPX support on named CPU models (partialy) +- 2660667 rhel: Set host-phys-bits-limit=48 on rhel machine-types + +Merged patches (4.2.0): +- 7d5c2ef pc: Don't make die-id mandatory unless necessary +- e42808c x86 machine types: pc_rhel_8_0_compat +- 9de83a8 x86 machine types: q35: Fixup units_per_default_bus +- 6df1559 x86 machine types: Fixup 
dynamic sysbus entries +- 0784125 x86 machine types: add pc-q35-rhel8.1.0 +- machines/x86: Add rhel 8.2 machine type (patch 92959) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/i386/acpi-build.c | 3 + + hw/i386/pc.c | 263 ++++++++++++++++++++++++++++++++++++++++++- + hw/i386/pc_piix.c | 210 +++++++++++++++++++++++++++++++++- + hw/i386/pc_q35.c | 156 ++++++++++++++++++++++++- + include/hw/boards.h | 2 + + include/hw/i386/pc.h | 33 ++++++ + target/i386/cpu.c | 9 +- + target/i386/kvm.c | 4 + + 8 files changed, 673 insertions(+), 7 deletions(-) + +diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c +index 12ff55fcfb..64001893ab 100644 +--- a/hw/i386/acpi-build.c ++++ b/hw/i386/acpi-build.c +@@ -204,6 +204,9 @@ static void acpi_get_pm_info(MachineState *machine, AcpiPmInfo *pm) + pm->fadt.reset_reg = r; + pm->fadt.reset_val = 0xf; + pm->fadt.flags |= 1 << ACPI_FADT_F_RESET_REG_SUP; ++ if (object_property_get_bool(lpc, ++ "__com.redhat_force-rev1-fadt", NULL)) ++ pm->fadt.rev = 1; + pm->cpu_hp_io_base = ICH9_CPU_HOTPLUG_IO_BASE; + } + +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +index ac08e63604..61e70e4811 100644 +--- a/hw/i386/pc.c ++++ b/hw/i386/pc.c +@@ -344,6 +344,261 @@ GlobalProperty pc_compat_1_4[] = { + }; + const size_t pc_compat_1_4_len = G_N_ELEMENTS(pc_compat_1_4); + ++/* This variable is for changes to properties that are RHEL specific, ++ * different to the current upstream and to be applied to the latest ++ * machine type.
++ */ ++GlobalProperty pc_rhel_compat[] = { ++ { TYPE_X86_CPU, "host-phys-bits", "on" }, ++ { TYPE_X86_CPU, "host-phys-bits-limit", "48" }, ++ /* bz 1508330 */ ++ { "vfio-pci", "x-no-geforce-quirks", "on" }, ++}; ++const size_t pc_rhel_compat_len = G_N_ELEMENTS(pc_rhel_compat); ++ ++/* pc_rhel_8_1_compat is empty since pc_4_1_compat is */ ++GlobalProperty pc_rhel_8_1_compat[] = { }; ++const size_t pc_rhel_8_1_compat_len = G_N_ELEMENTS(pc_rhel_8_1_compat); ++ ++GlobalProperty pc_rhel_8_0_compat[] = { ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "intel-iommu", "dma-drain", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G4" "-" TYPE_X86_CPU, "npt", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G4" "-" TYPE_X86_CPU, "nrip-save", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G5" "-" TYPE_X86_CPU, "npt", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G5" "-" TYPE_X86_CPU, "nrip-save", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "EPYC" "-" TYPE_X86_CPU, "npt", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "EPYC" "-" TYPE_X86_CPU, "nrip-save", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "EPYC-IBPB" "-" TYPE_X86_CPU, "npt", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "EPYC-IBPB" "-" TYPE_X86_CPU, "nrip-save", "off" }, ++ /** The mpx=on entries from pc_compat_3_1 are in pc_rhel_7_6_compat **/ ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Cascadelake-Server" "-" TYPE_X86_CPU, "stepping", "5" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { TYPE_X86_CPU, "x-intel-pt-auto-level", "off" }, ++}; ++const 
size_t pc_rhel_8_0_compat_len = G_N_ELEMENTS(pc_rhel_8_0_compat); ++ ++/* Similar to PC_COMPAT_3_0 + PC_COMPAT_2_12, but: ++ * all of the 2_12 stuff was already in 7.6 from bz 1481253 ++ * x-migrate-smi-count comes from PC_COMPAT_2_11 but ++ * is really tied to kernel version so keep it off on 7.x ++ * machine types irrespective of host. ++ */ ++GlobalProperty pc_rhel_7_6_compat[] = { ++ /* pc_rhel_7_6_compat from pc_compat_3_0 */ ++ { TYPE_X86_CPU, "x-hv-synic-kvm-only", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_3_0 */ ++ { "Skylake-Server" "-" TYPE_X86_CPU, "pku", "off" }, ++ /* pc_rhel_7_6_compat from pc_compat_3_0 */ ++ { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "pku", "off" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { TYPE_X86_CPU, "x-migrate-smi-count", "off" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Skylake-Client" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Skylake-Client-IBRS" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Skylake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Cascadelake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Icelake-Client" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Icelake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, ++}; ++const size_t pc_rhel_7_6_compat_len = G_N_ELEMENTS(pc_rhel_7_6_compat); ++ ++/* Similar to PC_COMPAT_2_11 + PC_COMPAT_2_10, but: ++ * - x-hv-max-vps was backported to 7.5 ++ * - x-pci-hole64-fix was backported to 7.5 ++ */ ++GlobalProperty pc_rhel_7_5_compat[] = { ++ /* pc_rhel_7_5_compat from pc_compat_2_11 */ ++ { "Skylake-Server" "-" TYPE_X86_CPU, "clflushopt", "off" }, ++ /* pc_rhel_7_5_compat from pc_compat_2_12 */ ++ { TYPE_X86_CPU, "legacy-cache", 
"on" }, ++ /* pc_rhel_7_5_compat from pc_compat_2_12 */ ++ { TYPE_X86_CPU, "topoext", "off" }, ++ /* pc_rhel_7_5_compat from pc_compat_2_12 */ ++ { "EPYC-" TYPE_X86_CPU, "xlevel", stringify(0x8000000a) }, ++ /* pc_rhel_7_5_compat from pc_compat_2_12 */ ++ { "EPYC-IBPB-" TYPE_X86_CPU, "xlevel", stringify(0x8000000a) }, ++}; ++const size_t pc_rhel_7_5_compat_len = G_N_ELEMENTS(pc_rhel_7_5_compat); ++ ++GlobalProperty pc_rhel_7_4_compat[] = { ++ /* pc_rhel_7_4_compat from pc_compat_2_9 */ ++ { "mch", "extended-tseg-mbytes", stringify(0) }, ++ /* bz 1489800 */ ++ { "ICH9-LPC", "__com.redhat_force-rev1-fadt", "on" }, ++ /* pc_rhel_7_4_compat from pc_compat_2_10 */ ++ { "i440FX-pcihost", "x-pci-hole64-fix", "off" }, ++ /* pc_rhel_7_4_compat from pc_compat_2_10 */ ++ { "q35-pcihost", "x-pci-hole64-fix", "off" }, ++ /* pc_rhel_7_4_compat from pc_compat_2_10 */ ++ { TYPE_X86_CPU, "x-hv-max-vps", "0x40" }, ++}; ++const size_t pc_rhel_7_4_compat_len = G_N_ELEMENTS(pc_rhel_7_4_compat); ++ ++GlobalProperty pc_rhel_7_3_compat[] = { ++ /* pc_rhel_7_3_compat from pc_compat_2_8 */ ++ { "kvmclock", "x-mach-use-reliable-get-clock", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { TYPE_X86_CPU, "l3-cache", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { TYPE_X86_CPU, "full-cpuid-auto-level", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { "Opteron_G3" "-" TYPE_X86_CPU, "family", "15" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { "Opteron_G3" "-" TYPE_X86_CPU, "model", "6" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { "Opteron_G3" "-" TYPE_X86_CPU, "stepping", "1" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { "isa-pcspk", "migrate", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_6 */ ++ { TYPE_X86_CPU, "cpuid-0xb", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_8 */ ++ { "ICH9-LPC", "x-smi-broadcast", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_8 */ ++ { TYPE_X86_CPU, "vmware-cpuid-freq", "off" }, ++ /* 
pc_rhel_7_3_compat from pc_compat_2_8 */ ++ { "Haswell-" TYPE_X86_CPU, "stepping", "1" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_3 added in 2.9*/ ++ { TYPE_X86_CPU, "kvm-no-smi-migration", "on" }, ++}; ++const size_t pc_rhel_7_3_compat_len = G_N_ELEMENTS(pc_rhel_7_3_compat); ++ ++GlobalProperty pc_rhel_7_2_compat[] = { ++ { "phenom" "-" TYPE_X86_CPU, "rdtscp", "off"}, ++ { "qemu64" "-" TYPE_X86_CPU, "sse4a", "on" }, ++ { "qemu64" "-" TYPE_X86_CPU, "abm", "on" }, ++ { "Haswell-" TYPE_X86_CPU, "abm", "off" }, ++ { "Haswell-IBRS" "-" TYPE_X86_CPU, "abm", "off" }, ++ { "Haswell-noTSX-" TYPE_X86_CPU, "abm", "off" }, ++ { "Haswell-noTSX-IBRS" "-" TYPE_X86_CPU, "abm", "off" }, ++ { "Broadwell-" TYPE_X86_CPU, "abm", "off" }, ++ { "Broadwell-IBRS" "-" TYPE_X86_CPU, "abm", "off" }, ++ { "Broadwell-noTSX-" TYPE_X86_CPU, "abm", "off" }, ++ { "Broadwell-noTSX-IBRS" "-" TYPE_X86_CPU, "abm", "off" }, ++ { "host" "-" TYPE_X86_CPU, "host-cache-info", "on" }, ++ { TYPE_X86_CPU, "check", "off" }, ++ { "qemu32" "-" TYPE_X86_CPU, "popcnt", "on" }, ++ { TYPE_X86_CPU, "arat", "off" }, ++ { "usb-redir", "streams", "off" }, ++ { TYPE_X86_CPU, "fill-mtrr-mask", "off" }, ++ { "apic-common", "legacy-instance-id", "on" }, ++}; ++const size_t pc_rhel_7_2_compat_len = G_N_ELEMENTS(pc_rhel_7_2_compat); ++ ++GlobalProperty pc_rhel_7_1_compat[] = { ++ { "kvm64" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "kvm32" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Conroe" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Penryn" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Nehalem" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Nehalem-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Westmere" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Westmere-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "SandyBridge" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "SandyBridge-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Haswell" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Haswell-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Broadwell" "-" TYPE_X86_CPU, "vme", "off" 
}, ++ { "Broadwell-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G1" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G2" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G3" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G4" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G5" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Haswell" "-" TYPE_X86_CPU, "f16c", "off" }, ++ { "Haswell-IBRS" "-" TYPE_X86_CPU, "f16c", "off" }, ++ { "Haswell" "-" TYPE_X86_CPU, "rdrand", "off" }, ++ { "Haswell-IBRS" "-" TYPE_X86_CPU, "rdrand", "off" }, ++ { "Broadwell" "-" TYPE_X86_CPU, "f16c", "off" }, ++ { "Broadwell-IBRS" "-" TYPE_X86_CPU, "f16c", "off" }, ++ { "Broadwell" "-" TYPE_X86_CPU, "rdrand", "off" }, ++ { "Broadwell-IBRS" "-" TYPE_X86_CPU, "rdrand", "off" }, ++ { "coreduo" "-" TYPE_X86_CPU, "vmx", "on" }, ++ { "core2duo" "-" TYPE_X86_CPU, "vmx", "on" }, ++ { "qemu64" "-" TYPE_X86_CPU, "min-level", stringify(4) }, ++ { "kvm64" "-" TYPE_X86_CPU, "min-level", stringify(5) }, ++ { "pentium3" "-" TYPE_X86_CPU, "min-level", stringify(2) }, ++ { "n270" "-" TYPE_X86_CPU, "min-level", stringify(5) }, ++ { "Conroe" "-" TYPE_X86_CPU, "min-level", stringify(4) }, ++ { "Penryn" "-" TYPE_X86_CPU, "min-level", stringify(4) }, ++ { "Nehalem" "-" TYPE_X86_CPU, "min-level", stringify(4) }, ++ { "n270" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Penryn" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Conroe" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Nehalem" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Westmere" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "SandyBridge" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "IvyBridge" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Haswell" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Haswell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Broadwell" "-" TYPE_X86_CPU, "min-xlevel", 
stringify(0x8000000a) }, ++ { "Broadwell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++}; ++const size_t pc_rhel_7_1_compat_len = G_N_ELEMENTS(pc_rhel_7_1_compat); ++ ++/* ++ * The PC_RHEL_*_COMPAT serve the same purpose for RHEL-7 machine ++ * types as the PC_COMPAT_* do for upstream types. ++ * PC_RHEL_7_*_COMPAT apply both to i440fx and q35 types. ++ */ ++ ++/* ++ * RHEL-7 is based on QEMU 1.5.3, so this needs the PC_COMPAT_* ++ * between our base and 1.5, less stuff backported to RHEL-7.0 ++ * (usb-device.msos-desc), less stuff for devices we changed ++ * (qemu64-x86_64-cpu) or don't support (hpet, pci-serial-2x, ++ * pci-serial-4x) in 7.0. ++ */ ++GlobalProperty pc_rhel_7_0_compat[] = { ++ { "virtio-scsi-pci", "any_layout", "off" }, ++ { "PIIX4_PM", "memory-hotplug-support", "off" }, ++ { "apic", "version", stringify(0x11) }, ++ { "nec-usb-xhci", "superspeed-ports-first", "off" }, ++ { "nec-usb-xhci", "force-pcie-endcap", "on" }, ++ { "pci-serial", "prog_if", stringify(0) }, ++ { "virtio-net-pci", "guest_announce", "off" }, ++ { "ICH9-LPC", "memory-hotplug-support", "off" }, ++ { "xio3130-downstream", COMPAT_PROP_PCP, "off" }, ++ { "ioh3420", COMPAT_PROP_PCP, "off" }, ++ { "PIIX4_PM", "acpi-pci-hotplug-with-bridge-support", "off" }, ++ { "e1000", "mitigation", "off" }, ++ { "virtio-net-pci", "ctrl_guest_offloads", "off" }, ++ { "Conroe" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Penryn" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Nehalem" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Nehalem-IBRS" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Westmere" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Westmere-IBRS" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G1" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G2" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G3" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G4" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G5" "-" TYPE_X86_CPU, "x2apic", "on" }, ++}; ++const size_t pc_rhel_7_0_compat_len 
= G_N_ELEMENTS(pc_rhel_7_0_compat); ++ + void gsi_handler(void *opaque, int n, int level) + { + GSIState *s = opaque; +@@ -1225,7 +1480,8 @@ void pc_memory_init(PCMachineState *pcms, + option_rom_mr = g_malloc(sizeof(*option_rom_mr)); + memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, + &error_fatal); +- if (pcmc->pci_enabled) { ++ /* RH difference: See bz 1489800, explicitly make ROM ro */ ++ if (pcmc->pc_rom_ro) { + memory_region_set_readonly(option_rom_mr, true); + } + memory_region_add_subregion_overlap(rom_memory, +@@ -2198,6 +2454,8 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) + pcmc->linuxboot_dma_enabled = true; + pcmc->pvh_enabled = true; + assert(!mc->get_hotplug_handler); ++ pcmc->pc_rom_ro = true; ++ mc->async_pf_vmexit_disable = false; + mc->get_hotplug_handler = pc_get_hotplug_handler; + mc->hotplug_allowed = pc_hotplug_allowed; + mc->cpu_index_to_instance_props = x86_cpu_index_to_props; +@@ -2209,7 +2467,8 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) + mc->hot_add_cpu = pc_hot_add_cpu; + mc->smp_parse = pc_smp_parse; + mc->block_default_type = IF_IDE; +- mc->max_cpus = 255; ++ /* 240: max CPU count for RHEL */ ++ mc->max_cpus = 240; + mc->reset = pc_machine_reset; + mc->wakeup = pc_machine_wakeup; + hc->pre_plug = pc_machine_device_pre_plug_cb; +diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c +index 1bd70d1abb..bd7fdb99bb 100644 +--- a/hw/i386/pc_piix.c ++++ b/hw/i386/pc_piix.c +@@ -53,6 +53,7 @@ + #include "cpu.h" + #include "qapi/error.h" + #include "qemu/error-report.h" ++#include "migration/migration.h" + #ifdef CONFIG_XEN + #include + #include "hw/xen/xen_pt.h" +@@ -173,8 +174,8 @@ static void pc_init1(MachineState *machine, + if (pcmc->smbios_defaults) { + MachineClass *mc = MACHINE_GET_CLASS(machine); + /* These values are guest ABI, do not change */ +- smbios_set_defaults("QEMU", "Standard PC (i440FX + PIIX, 1996)", +- mc->name, pcmc->smbios_legacy_mode, ++ 
smbios_set_defaults("Red Hat", "KVM", ++ mc->desc, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, + SMBIOS_ENTRY_POINT_21); + } +@@ -307,6 +308,7 @@ else { + * hw_compat_*, pc_compat_*, or * pc_*_machine_options(). + */ + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void pc_compat_2_3_fn(MachineState *machine) + { + PCMachineState *pcms = PC_MACHINE(machine); +@@ -1026,3 +1028,207 @@ static void xenfv_machine_options(MachineClass *m) + DEFINE_PC_MACHINE(xenfv, "xenfv", pc_xen_hvm_init, + xenfv_machine_options); + #endif ++#endif /* Disabled for Red Hat Enterprise Linux */ ++ ++/* Red Hat Enterprise Linux machine types */ ++ ++/* Options for the latest rhel7 machine type */ ++static void pc_machine_rhel7_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ m->family = "pc_piix_Y"; ++ m->default_machine_opts = "firmware=bios-256k.bin"; ++ pcmc->default_nic_model = "e1000"; ++ m->default_display = "std"; ++ m->no_parallel = 1; ++ machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); ++ compat_props_add(m->compat_props, pc_rhel_compat, pc_rhel_compat_len); ++ m->alias = "pc"; ++ m->is_default = 1; ++} ++ ++static void pc_init_rhel760(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel760_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_machine_rhel7_options(m); ++ m->desc = "RHEL 7.6.0 PC (i440FX + PIIX, 1996)"; ++ m->async_pf_vmexit_disable = true; ++ m->smbus_no_migration_support = true; ++ pcmc->pvh_enabled = false; ++ pcmc->default_cpu_version = CPU_VERSION_LEGACY; ++ compat_props_add(m->compat_props, hw_compat_rhel_8_1, hw_compat_rhel_8_1_len); ++ compat_props_add(m->compat_props, pc_rhel_8_1_compat, pc_rhel_8_1_compat_len); ++ compat_props_add(m->compat_props, hw_compat_rhel_8_0, hw_compat_rhel_8_0_len); ++ compat_props_add(m->compat_props, pc_rhel_8_0_compat, 
pc_rhel_8_0_compat_len); ++ compat_props_add(m->compat_props, hw_compat_rhel_7_6, hw_compat_rhel_7_6_len); ++ compat_props_add(m->compat_props, pc_rhel_7_6_compat, pc_rhel_7_6_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel760, "pc-i440fx-rhel7.6.0", pc_init_rhel760, ++ pc_machine_rhel760_options); ++ ++static void pc_init_rhel750(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel750_options(MachineClass *m) ++{ ++ pc_machine_rhel760_options(m); ++ m->alias = NULL; ++ m->is_default = 0; ++ m->desc = "RHEL 7.5.0 PC (i440FX + PIIX, 1996)"; ++ m->auto_enable_numa_with_memhp = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_5, hw_compat_rhel_7_5_len); ++ compat_props_add(m->compat_props, pc_rhel_7_5_compat, pc_rhel_7_5_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel750, "pc-i440fx-rhel7.5.0", pc_init_rhel750, ++ pc_machine_rhel750_options); ++ ++static void pc_init_rhel740(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel740_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_machine_rhel750_options(m); ++ m->desc = "RHEL 7.4.0 PC (i440FX + PIIX, 1996)"; ++ m->numa_auto_assign_ram = numa_legacy_auto_assign_ram; ++ pcmc->pc_rom_ro = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_4, hw_compat_rhel_7_4_len); ++ compat_props_add(m->compat_props, pc_rhel_7_4_compat, pc_rhel_7_4_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel740, "pc-i440fx-rhel7.4.0", pc_init_rhel740, ++ pc_machine_rhel740_options); ++ ++static void pc_init_rhel730(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel730_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_machine_rhel740_options(m); ++ m->desc = "RHEL 7.3.0 PC (i440FX + PIIX, 1996)"; ++ 
pcmc->linuxboot_dma_enabled = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_3, hw_compat_rhel_7_3_len); ++ compat_props_add(m->compat_props, pc_rhel_7_3_compat, pc_rhel_7_3_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel730, "pc-i440fx-rhel7.3.0", pc_init_rhel730, ++ pc_machine_rhel730_options); ++ ++ ++static void pc_init_rhel720(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel720_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ X86MachineClass *x86mc = X86_MACHINE_CLASS(m); ++ pc_machine_rhel730_options(m); ++ m->desc = "RHEL 7.2.0 PC (i440FX + PIIX, 1996)"; ++ /* From pc_i440fx_2_5_machine_options */ ++ x86mc->save_tsc_khz = false; ++ m->legacy_fw_cfg_order = 1; ++ /* Note: broken_reserved_end was already in 7.2 */ ++ /* From pc_i440fx_2_6_machine_options */ ++ pcmc->legacy_cpu_hotplug = true; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_2, hw_compat_rhel_7_2_len); ++ compat_props_add(m->compat_props, pc_rhel_7_2_compat, pc_rhel_7_2_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel720, "pc-i440fx-rhel7.2.0", pc_init_rhel720, ++ pc_machine_rhel720_options); ++ ++static void pc_compat_rhel710(MachineState *machine) ++{ ++ PCMachineState *pcms = PC_MACHINE(machine); ++ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); ++ ++ /* From pc_compat_2_2 */ ++ pcmc->rsdp_in_ram = false; ++ machine->suppress_vmdesc = true; ++ ++ /* From pc_compat_2_1 */ ++ pcmc->smbios_uuid_encoded = false; ++ x86_cpu_change_kvm_default("svm", NULL); ++ pcmc->enforce_aligned_dimm = false; ++ ++ /* Disable all the extra subsections that were added in 2.2 */ ++ migrate_pre_2_2 = true; ++ ++ /* From pc_i440fx_2_4_machine_options */ ++ pcmc->broken_reserved_end = true; ++} ++ ++static void pc_init_rhel710(MachineState *machine) ++{ ++ pc_compat_rhel710(machine); ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static 
void pc_machine_rhel710_options(MachineClass *m) ++{ ++ pc_machine_rhel720_options(m); ++ m->family = "pc_piix_Y"; ++ m->desc = "RHEL 7.1.0 PC (i440FX + PIIX, 1996)"; ++ m->default_display = "cirrus"; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_1, hw_compat_rhel_7_1_len); ++ compat_props_add(m->compat_props, pc_rhel_7_1_compat, pc_rhel_7_1_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel710, "pc-i440fx-rhel7.1.0", pc_init_rhel710, ++ pc_machine_rhel710_options); ++ ++static void pc_compat_rhel700(MachineState *machine) ++{ ++ PCMachineState *pcms = PC_MACHINE(machine); ++ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); ++ ++ pc_compat_rhel710(machine); ++ ++ /* Upstream enables it for everyone, we're a little more selective */ ++ x86_cpu_change_kvm_default("x2apic", NULL); ++ x86_cpu_change_kvm_default("svm", NULL); ++ pcmc->legacy_acpi_table_size = 6418; /* see pc_compat_2_0() */ ++ pcmc->smbios_legacy_mode = true; ++ pcmc->has_reserved_memory = false; ++ migrate_cve_2014_5263_xhci_fields = true; ++} ++ ++static void pc_init_rhel700(MachineState *machine) ++{ ++ pc_compat_rhel700(machine); ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel700_options(MachineClass *m) ++{ ++ pc_machine_rhel710_options(m); ++ m->family = "pc_piix_Y"; ++ m->desc = "RHEL 7.0.0 PC (i440FX + PIIX, 1996)"; ++ compat_props_add(m->compat_props, pc_rhel_7_0_compat, pc_rhel_7_0_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel700, "pc-i440fx-rhel7.0.0", pc_init_rhel700, ++ pc_machine_rhel700_options); +diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c +index 385e5cffb1..7531d8ed76 100644 +--- a/hw/i386/pc_q35.c ++++ b/hw/i386/pc_q35.c +@@ -197,8 +197,8 @@ static void pc_q35_init(MachineState *machine) + + if (pcmc->smbios_defaults) { + /* These values are guest ABI, do not change */ +- smbios_set_defaults("QEMU", "Standard PC (Q35 + ICH9, 2009)", +- mc->name, pcmc->smbios_legacy_mode, ++ smbios_set_defaults("Red Hat", 
"KVM", ++ mc->desc, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, + SMBIOS_ENTRY_POINT_21); + } +@@ -330,6 +330,7 @@ static void pc_q35_init(MachineState *machine) + DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn) + + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void pc_q35_machine_options(MachineClass *m) + { + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); +@@ -533,3 +534,154 @@ static void pc_q35_2_4_machine_options(MachineClass *m) + + DEFINE_Q35_MACHINE(v2_4, "pc-q35-2.4", NULL, + pc_q35_2_4_machine_options); ++#endif /* Disabled for Red Hat Enterprise Linux */ ++ ++/* Red Hat Enterprise Linux machine types */ ++ ++/* Options for the latest rhel q35 machine type */ ++static void pc_q35_machine_rhel_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pcmc->default_nic_model = "e1000e"; ++ m->family = "pc_q35_Z"; ++ m->units_per_default_bus = 1; ++ m->default_machine_opts = "firmware=bios-256k.bin"; ++ m->default_display = "std"; ++ m->no_floppy = 1; ++ m->no_parallel = 1; ++ pcmc->default_cpu_version = 1; ++ machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); ++ machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); ++ machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); ++ m->alias = "q35"; ++ m->max_cpus = 384; ++ compat_props_add(m->compat_props, pc_rhel_compat, pc_rhel_compat_len); ++} ++ ++static void pc_q35_init_rhel820(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel820_options(MachineClass *m) ++{ ++ pc_q35_machine_rhel_options(m); ++ m->desc = "RHEL-8.2.0 PC (Q35 + ICH9, 2009)"; ++} ++ ++DEFINE_PC_MACHINE(q35_rhel820, "pc-q35-rhel8.2.0", pc_q35_init_rhel820, ++ pc_q35_machine_rhel820_options); ++ ++static void pc_q35_init_rhel810(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel810_options(MachineClass *m) ++{ ++ pc_q35_machine_rhel820_options(m); ++ m->desc = "RHEL-8.1.0 
PC (Q35 + ICH9, 2009)"; ++ m->alias = NULL; ++ compat_props_add(m->compat_props, hw_compat_rhel_8_1, hw_compat_rhel_8_1_len); ++ compat_props_add(m->compat_props, pc_rhel_8_1_compat, pc_rhel_8_1_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel810, "pc-q35-rhel8.1.0", pc_q35_init_rhel810, ++ pc_q35_machine_rhel810_options); ++ ++static void pc_q35_init_rhel800(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel800_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_q35_machine_rhel810_options(m); ++ m->desc = "RHEL-8.0.0 PC (Q35 + ICH9, 2009)"; ++ m->smbus_no_migration_support = true; ++ m->alias = NULL; ++ pcmc->pvh_enabled = false; ++ pcmc->default_cpu_version = CPU_VERSION_LEGACY; ++ compat_props_add(m->compat_props, hw_compat_rhel_8_0, hw_compat_rhel_8_0_len); ++ compat_props_add(m->compat_props, pc_rhel_8_0_compat, pc_rhel_8_0_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel800, "pc-q35-rhel8.0.0", pc_q35_init_rhel800, ++ pc_q35_machine_rhel800_options); ++ ++static void pc_q35_init_rhel760(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel760_options(MachineClass *m) ++{ ++ pc_q35_machine_rhel800_options(m); ++ m->alias = NULL; ++ m->desc = "RHEL-7.6.0 PC (Q35 + ICH9, 2009)"; ++ m->async_pf_vmexit_disable = true; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_6, hw_compat_rhel_7_6_len); ++ compat_props_add(m->compat_props, pc_rhel_7_6_compat, pc_rhel_7_6_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel760, "pc-q35-rhel7.6.0", pc_q35_init_rhel760, ++ pc_q35_machine_rhel760_options); ++ ++static void pc_q35_init_rhel750(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel750_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_q35_machine_rhel760_options(m); ++ m->alias = NULL; ++ m->desc = "RHEL-7.5.0 PC (Q35 + ICH9, 2009)"; ++ m->auto_enable_numa_with_memhp = false; ++ 
pcmc->default_nic_model = "e1000"; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_5, hw_compat_rhel_7_5_len); ++ compat_props_add(m->compat_props, pc_rhel_7_5_compat, pc_rhel_7_5_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel750, "pc-q35-rhel7.5.0", pc_q35_init_rhel750, ++ pc_q35_machine_rhel750_options); ++ ++static void pc_q35_init_rhel740(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel740_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_q35_machine_rhel750_options(m); ++ m->desc = "RHEL-7.4.0 PC (Q35 + ICH9, 2009)"; ++ m->numa_auto_assign_ram = numa_legacy_auto_assign_ram; ++ pcmc->pc_rom_ro = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_4, hw_compat_rhel_7_4_len); ++ compat_props_add(m->compat_props, pc_rhel_7_4_compat, pc_rhel_7_4_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel740, "pc-q35-rhel7.4.0", pc_q35_init_rhel740, ++ pc_q35_machine_rhel740_options); ++ ++static void pc_q35_init_rhel730(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel730_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_q35_machine_rhel740_options(m); ++ m->desc = "RHEL-7.3.0 PC (Q35 + ICH9, 2009)"; ++ m->max_cpus = 255; ++ pcmc->linuxboot_dma_enabled = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_3, hw_compat_rhel_7_3_len); ++ compat_props_add(m->compat_props, pc_rhel_7_3_compat, pc_rhel_7_3_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel730, "pc-q35-rhel7.3.0", pc_q35_init_rhel730, ++ pc_q35_machine_rhel730_options); +diff --git a/include/hw/boards.h b/include/hw/boards.h +index 6f85a0e032..2920bdef5b 100644 +--- a/include/hw/boards.h ++++ b/include/hw/boards.h +@@ -222,6 +222,8 @@ struct MachineClass { + const char **valid_cpu_types; + strList *allowed_dynamic_sysbus_devices; + bool auto_enable_numa_with_memhp; ++ /* RHEL only */ ++ bool async_pf_vmexit_disable; + void 
(*numa_auto_assign_ram)(MachineClass *mc, NodeInfo *nodes, + int nb_nodes, ram_addr_t size); + bool ignore_boot_device_suffixes; +diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h +index 1f86eba3f9..2e362c8faa 100644 +--- a/include/hw/i386/pc.h ++++ b/include/hw/i386/pc.h +@@ -124,6 +124,9 @@ typedef struct PCMachineClass { + + /* use PVH to load kernels that support this feature */ + bool pvh_enabled; ++ ++ /* RH only, see bz 1489800 */ ++ bool pc_rom_ro; + } PCMachineClass; + + #define TYPE_PC_MACHINE "generic-pc-machine" +@@ -300,6 +303,36 @@ extern const size_t pc_compat_1_5_len; + extern GlobalProperty pc_compat_1_4[]; + extern const size_t pc_compat_1_4_len; + ++extern GlobalProperty pc_rhel_compat[]; ++extern const size_t pc_rhel_compat_len; ++ ++extern GlobalProperty pc_rhel_8_1_compat[]; ++extern const size_t pc_rhel_8_1_compat_len; ++ ++extern GlobalProperty pc_rhel_8_0_compat[]; ++extern const size_t pc_rhel_8_0_compat_len; ++ ++extern GlobalProperty pc_rhel_7_6_compat[]; ++extern const size_t pc_rhel_7_6_compat_len; ++ ++extern GlobalProperty pc_rhel_7_5_compat[]; ++extern const size_t pc_rhel_7_5_compat_len; ++ ++extern GlobalProperty pc_rhel_7_4_compat[]; ++extern const size_t pc_rhel_7_4_compat_len; ++ ++extern GlobalProperty pc_rhel_7_3_compat[]; ++extern const size_t pc_rhel_7_3_compat_len; ++ ++extern GlobalProperty pc_rhel_7_2_compat[]; ++extern const size_t pc_rhel_7_2_compat_len; ++ ++extern GlobalProperty pc_rhel_7_1_compat[]; ++extern const size_t pc_rhel_7_1_compat_len; ++ ++extern GlobalProperty pc_rhel_7_0_compat[]; ++extern const size_t pc_rhel_7_0_compat_len; ++ + /* Helper for setting model-id for CPU models that changed model-id + * depending on QEMU versions up to QEMU 2.4. 
+ */ +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 1b7880ae3a..790db778ab 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1829,11 +1829,17 @@ static CPUCaches epyc_cache_info = { + + static X86CPUDefinition builtin_x86_defs[] = { + { ++ /* qemu64 is the default CPU model for all *-rhel7.* machine-types. ++ * The default on RHEL-6 was cpu64-rhel6. ++ * libvirt assumes that qemu64 is the default for _all_ machine-types, ++ * so we should try to keep qemu64 and cpu64-rhel6 as similar as ++ * possible. ++ */ + .name = "qemu64", + .level = 0xd, + .vendor = CPUID_VENDOR_AMD, + .family = 6, +- .model = 6, ++ .model = 13, + .stepping = 3, + .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | + CPUID_MMX | CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | +@@ -3932,6 +3938,7 @@ static PropValue kvm_default_props[] = { + { "acpi", "off" }, + { "monitor", "off" }, + { "svm", "off" }, ++ { "kvm-pv-unhalt", "on" }, + { NULL, NULL }, + }; + +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index 1d10046a6c..86d9a1f364 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -3079,6 +3079,7 @@ static int kvm_get_msrs(X86CPU *cpu) + struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries; + int ret, i; + uint64_t mtrr_top_bits; ++ MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); + + kvm_msr_buf_reset(cpu); + +@@ -3388,6 +3389,9 @@ static int kvm_get_msrs(X86CPU *cpu) + break; + case MSR_KVM_ASYNC_PF_EN: + env->async_pf_en_msr = msrs[i].data; ++ if (mc->async_pf_vmexit_disable) { ++ env->async_pf_en_msr &= ~(1ULL << 2); ++ } + break; + case MSR_KVM_PV_EOI_EN: + env->pv_eoi_en_msr = msrs[i].data; +-- +2.21.0 + diff --git a/0012-Enable-make-check.patch b/0012-Enable-make-check.patch new file mode 100755 index 0000000..09f7b4e --- /dev/null +++ b/0012-Enable-make-check.patch @@ -0,0 +1,307 @@ +From 154215041df085271a780a2989f4f481226e3e34 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 13:48:41 +0200 
+Subject: Enable make check + +Fixing tests after device disabling and machine types changes and enabling +make check run during build. + +Signed-off-by: Miroslav Rezanina + +Rebase changes (4.0.0): +- Remove testing for pseries-2.7 in endianness test +- Disable device-plug-test on s390x as it uses a disabled device +- Do not run cpu-plug-tests on 7.3 and older machine types + +Rebase changes (4.1.0-rc0): +- removed iotests 068 + +Rebase changes (4.1.0-rc1): +- remove all 205 tests (unstable) + +Rebase changes (4.2.0-rc0): +- partially disable hd-geo-test (requires lsi53c895a) + +Merged patches (4.0.0): +- f7ffd13 Remove 7 qcow2 and luks iotests that are taking > 25 sec to run during the fast train build proce + +Merged patches (4.1.0-rc0): +- 41288ff redhat: Remove raw iotest 205 + +Signed-off-by: Danilo C. L. de Paula +--- + redhat/qemu-kvm.spec.template | 2 +- + tests/Makefile.include | 10 +++++----- + tests/boot-serial-test.c | 6 +++++- + tests/cpu-plug-test.c | 4 ++-- + tests/e1000-test.c | 2 ++ + tests/hd-geo-test.c | 4 ++++ + tests/prom-env-test.c | 4 ++++ + tests/qemu-iotests/051 | 12 ++++++------ + tests/qemu-iotests/group | 4 ++-- + tests/test-x86-cpuid-compat.c | 2 ++ + tests/usb-hcd-xhci-test.c | 4 ++++ + 11 files changed, 37 insertions(+), 17 deletions(-) + +diff --git a/tests/Makefile.include b/tests/Makefile.include +index b483790cf3..53bdbdfee0 100644 +--- a/tests/Makefile.include ++++ b/tests/Makefile.include +@@ -172,7 +172,7 @@ check-qtest-i386-y += tests/ide-test$(EXESUF) + check-qtest-i386-y += tests/ahci-test$(EXESUF) + check-qtest-i386-y += tests/hd-geo-test$(EXESUF) + check-qtest-i386-y += tests/boot-order-test$(EXESUF) +-check-qtest-i386-y += tests/bios-tables-test$(EXESUF) ++#check-qtest-i386-y += tests/bios-tables-test$(EXESUF) + check-qtest-i386-$(CONFIG_SGA) += tests/boot-serial-test$(EXESUF) + check-qtest-i386-$(CONFIG_SLIRP) += tests/pxe-test$(EXESUF) + check-qtest-i386-y += tests/rtc-test$(EXESUF) +@@ -230,7 +230,7 @@ 
check-qtest-mips64el-$(CONFIG_VGA) += tests/display-vga-test$(EXESUF) + check-qtest-moxie-y += tests/boot-serial-test$(EXESUF) + + check-qtest-ppc-$(CONFIG_ISA_TESTDEV) = tests/endianness-test$(EXESUF) +-check-qtest-ppc-y += tests/boot-order-test$(EXESUF) ++#check-qtest-ppc-y += tests/boot-order-test$(EXESUF) + check-qtest-ppc-y += tests/prom-env-test$(EXESUF) + check-qtest-ppc-y += tests/drive_del-test$(EXESUF) + check-qtest-ppc-y += tests/boot-serial-test$(EXESUF) +@@ -244,8 +244,8 @@ check-qtest-ppc64-$(CONFIG_PSERIES) += tests/rtas-test$(EXESUF) + check-qtest-ppc64-$(CONFIG_SLIRP) += tests/pxe-test$(EXESUF) + check-qtest-ppc64-$(CONFIG_USB_UHCI) += tests/usb-hcd-uhci-test$(EXESUF) + check-qtest-ppc64-$(CONFIG_USB_XHCI_NEC) += tests/usb-hcd-xhci-test$(EXESUF) +-check-qtest-ppc64-$(CONFIG_SLIRP) += tests/test-netfilter$(EXESUF) +-check-qtest-ppc64-$(CONFIG_POSIX) += tests/test-filter-mirror$(EXESUF) ++#check-qtest-ppc64-$(CONFIG_SLIRP) += tests/test-netfilter$(EXESUF) ++#check-qtest-ppc64-$(CONFIG_POSIX) += tests/test-filter-mirror$(EXESUF) + check-qtest-ppc64-$(CONFIG_RTL8139_PCI) += tests/test-filter-redirector$(EXESUF) + check-qtest-ppc64-$(CONFIG_VGA) += tests/display-vga-test$(EXESUF) + check-qtest-ppc64-y += tests/numa-test$(EXESUF) +@@ -291,7 +291,7 @@ check-qtest-s390x-$(CONFIG_SLIRP) += tests/test-netfilter$(EXESUF) + check-qtest-s390x-$(CONFIG_POSIX) += tests/test-filter-mirror$(EXESUF) + check-qtest-s390x-$(CONFIG_POSIX) += tests/test-filter-redirector$(EXESUF) + check-qtest-s390x-y += tests/drive_del-test$(EXESUF) +-check-qtest-s390x-y += tests/device-plug-test$(EXESUF) ++#check-qtest-s390x-y += tests/device-plug-test$(EXESUF) + check-qtest-s390x-y += tests/virtio-ccw-test$(EXESUF) + check-qtest-s390x-y += tests/cpu-plug-test$(EXESUF) + check-qtest-s390x-y += tests/migration-test$(EXESUF) +diff --git a/tests/boot-serial-test.c b/tests/boot-serial-test.c +index d3a54a0ba5..33ce72b89c 100644 +--- a/tests/boot-serial-test.c ++++ 
b/tests/boot-serial-test.c +@@ -108,19 +108,23 @@ static testdef_t tests[] = { + { "ppc", "g3beige", "", "PowerPC,750" }, + { "ppc", "mac99", "", "PowerPC,G4" }, + { "ppc", "sam460ex", "-m 256", "DRAM: 256 MiB" }, ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + { "ppc64", "ppce500", "", "U-Boot" }, + { "ppc64", "40p", "-m 192", "Memory: 192M" }, + { "ppc64", "mac99", "", "PowerPC,970FX" }, ++#endif + { "ppc64", "pseries", + "-machine cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken", + "Open Firmware" }, ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + { "ppc64", "powernv8", "", "OPAL" }, + { "ppc64", "powernv9", "", "OPAL" }, + { "ppc64", "sam460ex", "-device e1000", "8086 100e" }, ++#endif + { "i386", "isapc", "-cpu qemu32 -device sga", "SGABIOS" }, + { "i386", "pc", "-device sga", "SGABIOS" }, + { "i386", "q35", "-device sga", "SGABIOS" }, +- { "x86_64", "isapc", "-cpu qemu32 -device sga", "SGABIOS" }, ++ { "x86_64", "pc", "-cpu qemu32 -device sga", "SGABIOS" }, + { "x86_64", "q35", "-device sga", "SGABIOS" }, + { "sparc", "LX", "", "TMS390S10" }, + { "sparc", "SS-4", "", "MB86904" }, +diff --git a/tests/cpu-plug-test.c b/tests/cpu-plug-test.c +index 30e514bbfb..a04beae1c6 100644 +--- a/tests/cpu-plug-test.c ++++ b/tests/cpu-plug-test.c +@@ -185,8 +185,8 @@ static void add_pseries_test_case(const char *mname) + char *path; + PlugTestData *data; + +- if (!g_str_has_prefix(mname, "pseries-") || +- (g_str_has_prefix(mname, "pseries-2.") && atoi(&mname[10]) < 7)) { ++ if (!g_str_has_prefix(mname, "pseries-rhel") || ++ (g_str_has_prefix(mname, "pseries-rhel7.") && atoi(&mname[14]) < 4)) { + return; + } + data = g_new(PlugTestData, 1); +diff --git a/tests/e1000-test.c b/tests/e1000-test.c +index c387984ef6..c89112d6f8 100644 +--- a/tests/e1000-test.c ++++ b/tests/e1000-test.c +@@ -22,9 +22,11 @@ struct QE1000 { + + static const char *models[] = { + "e1000", ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + "e1000-82540em", + "e1000-82544gc", + 
"e1000-82545em", ++#endif + }; + + static void *e1000_get_driver(void *obj, const char *interface) +diff --git a/tests/hd-geo-test.c b/tests/hd-geo-test.c +index 7e86c5416c..cc068bad87 100644 +--- a/tests/hd-geo-test.c ++++ b/tests/hd-geo-test.c +@@ -732,6 +732,7 @@ static void test_override_ide(void) + test_override(args, expected); + } + ++#if 0 /* Require lsi53c895a - not supported on RHEL */ + static void test_override_scsi(void) + { + TestArgs *args = create_args(); +@@ -776,6 +777,7 @@ static void test_override_scsi_2_controllers(void) + add_scsi_disk(args, 3, 1, 0, 1, 2, 0, 1, 0); + test_override(args, expected); + } ++#endif + + static void test_override_virtio_blk(void) + { +@@ -951,9 +953,11 @@ int main(int argc, char **argv) + qtest_add_func("hd-geo/ide/device/user/chst", test_ide_device_user_chst); + if (have_qemu_img()) { + qtest_add_func("hd-geo/override/ide", test_override_ide); ++#if 0 /* Require lsi53c895a - not supported on RHEL */ + qtest_add_func("hd-geo/override/scsi", test_override_scsi); + qtest_add_func("hd-geo/override/scsi_2_controllers", + test_override_scsi_2_controllers); ++#endif + qtest_add_func("hd-geo/override/virtio_blk", test_override_virtio_blk); + qtest_add_func("hd-geo/override/zero_chs", test_override_zero_chs); + qtest_add_func("hd-geo/override/scsi_hot_unplug", +diff --git a/tests/prom-env-test.c b/tests/prom-env-test.c +index 61bc1d1e7b..028d45c7d7 100644 +--- a/tests/prom-env-test.c ++++ b/tests/prom-env-test.c +@@ -88,10 +88,14 @@ int main(int argc, char *argv[]) + if (!strcmp(arch, "ppc")) { + add_tests(ppc_machines); + } else if (!strcmp(arch, "ppc64")) { ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + add_tests(ppc_machines); + if (g_test_slow()) { ++#endif + qtest_add_data_func("prom-env/pseries", "pseries", test_machine); ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + } ++#endif + } else if (!strcmp(arch, "sparc")) { + add_tests(sparc_machines); + } else if (!strcmp(arch, "sparc64")) { +diff --git 
a/tests/qemu-iotests/051 b/tests/qemu-iotests/051 +index 53bcdbc911..b387e0c233 100755 +--- a/tests/qemu-iotests/051 ++++ b/tests/qemu-iotests/051 +@@ -181,11 +181,11 @@ run_qemu -drive if=virtio + case "$QEMU_DEFAULT_MACHINE" in + pc) + run_qemu -drive if=none,id=disk -device ide-cd,drive=disk +- run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-cd,drive=disk ++# run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-cd,drive=disk + run_qemu -drive if=none,id=disk -device ide-drive,drive=disk + run_qemu -drive if=none,id=disk -device ide-hd,drive=disk +- run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-disk,drive=disk +- run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-hd,drive=disk ++# run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-disk,drive=disk ++# run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-hd,drive=disk + ;; + *) + ;; +@@ -234,11 +234,11 @@ run_qemu -drive file="$TEST_IMG",if=virtio,readonly=on + case "$QEMU_DEFAULT_MACHINE" in + pc) + run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device ide-cd,drive=disk +- run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-cd,drive=disk ++# run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-cd,drive=disk + run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device ide-drive,drive=disk + run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device ide-hd,drive=disk +- run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-disk,drive=disk +- run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-hd,drive=disk ++# run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-disk,drive=disk ++# run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-hd,drive=disk + ;; + *) + ;; 
+diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 6b10a6a762..06cc734b26 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -92,7 +92,7 @@ + 068 rw quick + 069 rw auto quick + 070 rw quick +-071 rw auto quick ++# 071 rw auto quick -- requires whitelisted blkverify + 072 rw auto quick + 073 rw auto quick + 074 rw auto quick +@@ -120,7 +120,7 @@ + 096 rw quick + 097 rw auto backing + 098 rw auto backing quick +-099 rw auto quick ++# 099 rw auto quick -- requires whitelisted blkverify + # 100 was removed, do not reuse + 101 rw quick + 102 rw quick +diff --git a/tests/test-x86-cpuid-compat.c b/tests/test-x86-cpuid-compat.c +index 772287bdb4..e7c075ed98 100644 +--- a/tests/test-x86-cpuid-compat.c ++++ b/tests/test-x86-cpuid-compat.c +@@ -300,6 +300,7 @@ int main(int argc, char **argv) + "-cpu 486,xlevel2=0xC0000002,+xstore", + "xlevel2", 0xC0000002); + ++#if 0 /* Disabled in Red Hat Enterprise Linux */ + /* Check compatibility of old machine-types that didn't + * auto-increase level/xlevel/xlevel2: */ + +@@ -350,6 +351,7 @@ int main(int argc, char **argv) + add_cpuid_test("x86/cpuid/xlevel-compat/pc-i440fx-2.4/npt-on", + "-machine pc-i440fx-2.4 -cpu SandyBridge,+npt", + "xlevel", 0x80000008); ++#endif + + /* Test feature parsing */ + add_feature_test("x86/cpuid/features/plus", +diff --git a/tests/usb-hcd-xhci-test.c b/tests/usb-hcd-xhci-test.c +index 10ef9d2a91..3855873050 100644 +--- a/tests/usb-hcd-xhci-test.c ++++ b/tests/usb-hcd-xhci-test.c +@@ -21,6 +21,7 @@ static void test_xhci_hotplug(void) + usb_test_hotplug(global_qtest, "xhci", "1", NULL); + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void test_usb_uas_hotplug(void) + { + QTestState *qts = global_qtest; +@@ -36,6 +37,7 @@ static void test_usb_uas_hotplug(void) + qtest_qmp_device_del(qts, "scsihd"); + qtest_qmp_device_del(qts, "uas"); + } ++#endif + + static void test_usb_ccid_hotplug(void) + { +@@ -56,7 +58,9 @@ int main(int argc, char 
**argv) + + qtest_add_func("/xhci/pci/init", test_xhci_init); + qtest_add_func("/xhci/pci/hotplug", test_xhci_hotplug); ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + qtest_add_func("/xhci/pci/hotplug/usb-uas", test_usb_uas_hotplug); ++#endif + qtest_add_func("/xhci/pci/hotplug/usb-ccid", test_usb_ccid_hotplug); + + qtest_start("-device nec-usb-xhci,id=xhci" +-- +2.21.0 + diff --git a/0013-vfio-cap-number-of-devices-that-can-be-assigned.patch b/0013-vfio-cap-number-of-devices-that-can-be-assigned.patch new file mode 100755 index 0000000..db776c4 --- /dev/null +++ b/0013-vfio-cap-number-of-devices-that-can-be-assigned.patch @@ -0,0 +1,114 @@ +From de433da59448eaad4ac1b902d07d57b57f922aff Mon Sep 17 00:00:00 2001 +From: Bandan Das +Date: Tue, 3 Dec 2013 20:05:13 +0100 +Subject: vfio: cap number of devices that can be assigned + +RH-Author: Bandan Das +Message-id: <1386101113-31560-3-git-send-email-bsd@redhat.com> +Patchwork-id: 55984 +O-Subject: [PATCH RHEL7 qemu-kvm v2 2/2] vfio: cap number of devices that can be assigned +Bugzilla: 678368 +RH-Acked-by: Alex Williamson +RH-Acked-by: Marcelo Tosatti +RH-Acked-by: Michael S. Tsirkin + +Go through all groups to get count of total number of devices +active to enforce limit + +Reasoning from Alex for the limit(32) - Assuming 3 slots per +device, with 125 slots (number of memory slots for RHEL 7), +we can support almost 40 devices and still have few slots left +for other uses. Stepping down a bit, the number 32 arbitrarily +matches the number of slots on a PCI bus and is also a nice power +of two. 
+ +Signed-off-by: Bandan Das + +Rebase notes (2.8.0): +- removed return value for vfio_realize (commit 1a22aca) + +Merged patches (2.9.0): +- 17eb774 vfio: Use error_setg when reporting max assigned device overshoot + + Merged patches (4.1.0-rc3): +- 2b89558 vfio: increase the cap on number of assigned devices to 64 + +(cherry picked from commit 9fa3c9fc6dfcde76d80db1aa601b2d577f72ceec) +(cherry picked from commit 3cb35556dc7d994f203d732fe952f95fcdb03c0a) +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/pci.c | 29 ++++++++++++++++++++++++++++- + hw/vfio/pci.h | 1 + + 2 files changed, 29 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index c8534d3035..309535f306 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -47,6 +47,9 @@ + + #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" + ++/* RHEL only: Set once for the first assigned dev */ ++static uint16_t device_limit; ++ + static void vfio_disable_interrupts(VFIOPCIDevice *vdev); + static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); + +@@ -2722,9 +2725,30 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + ssize_t len; + struct stat st; + int groupid; +- int i, ret; ++ int ret, i = 0; + bool is_mdev; + ++ if (device_limit && device_limit != vdev->assigned_device_limit) { ++ error_setg(errp, "Assigned device limit has been redefined. 
" ++ "Old:%d, New:%d", ++ device_limit, vdev->assigned_device_limit); ++ return; ++ } else { ++ device_limit = vdev->assigned_device_limit; ++ } ++ ++ QLIST_FOREACH(group, &vfio_group_list, next) { ++ QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { ++ i++; ++ } ++ } ++ ++ if (i >= vdev->assigned_device_limit) { ++ error_setg(errp, "Maximum supported vfio devices (%d) " ++ "already attached", vdev->assigned_device_limit); ++ return; ++ } ++ + if (!vdev->vbasedev.sysfsdev) { + if (!(~vdev->host.domain || ~vdev->host.bus || + ~vdev->host.slot || ~vdev->host.function)) { +@@ -3167,6 +3191,9 @@ static Property vfio_pci_dev_properties[] = { + DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false), + DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice, + no_geforce_quirks, false), ++ /* RHEL only */ ++ DEFINE_PROP_UINT16("x-assigned-device-limit", VFIOPCIDevice, ++ assigned_device_limit, 64), + DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd, + false), + DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd, +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 35626cd63e..0cd4803aee 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -135,6 +135,7 @@ typedef struct VFIOPCIDevice { + EventNotifier err_notifier; + EventNotifier req_notifier; + int (*resetfn)(struct VFIOPCIDevice *); ++ uint16_t assigned_device_limit; + uint32_t vendor_id; + uint32_t device_id; + uint32_t sub_vendor_id; +-- +2.21.0 + diff --git a/0014-Add-support-statement-to-help-output.patch b/0014-Add-support-statement-to-help-output.patch new file mode 100755 index 0000000..cb77bfe --- /dev/null +++ b/0014-Add-support-statement-to-help-output.patch @@ -0,0 +1,58 @@ +From 2754dd8da8975757753fd491985d5e7b36966106 Mon Sep 17 00:00:00 2001 +From: Eduardo Habkost +Date: Wed, 4 Dec 2013 18:53:17 +0100 +Subject: Add support statement to -help output + +RH-Author: Eduardo Habkost +Message-id: 
<1386183197-27761-1-git-send-email-ehabkost@redhat.com> +Patchwork-id: 55994 +O-Subject: [qemu-kvm RHEL7 PATCH] Add support statement to -help output +Bugzilla: 972773 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: knoel@redhat.com +RH-Acked-by: Paolo Bonzini + +Add support statement to -help output, reporting direct qemu-kvm usage +as unsupported by Red Hat, and advising users to use libvirt instead. + +Signed-off-by: Eduardo Habkost +(cherry picked from commit 2a07700936e39856cc9f149c6a6517f0715536a6) +(cherry picked from commit 5dd2f4706e2fef945771949e59a8fcc1b5452de9) +Signed-off-by: Danilo C. L. de Paula +--- + vl.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/vl.c b/vl.c +index 668a34577e..9f3e7e7733 100644 +--- a/vl.c ++++ b/vl.c +@@ -1822,9 +1822,17 @@ static void version(void) + QEMU_COPYRIGHT "\n"); + } + ++static void print_rh_warning(void) ++{ ++ printf("\nWARNING: Direct use of qemu-kvm from the command line is not supported by Red Hat.\n" ++ "WARNING: Use libvirt as the stable management interface.\n" ++ "WARNING: Some command line options listed here may not be available in future releases.\n\n"); ++} ++ + static void help(int exitcode) + { + version(); ++ print_rh_warning(); + printf("usage: %s [options] [disk_image]\n\n" + "'disk_image' is a raw hard disk image for IDE hard disk 0\n\n", + error_get_progname()); +@@ -1841,6 +1849,7 @@ static void help(int exitcode) + "\n" + QEMU_HELP_BOTTOM "\n"); + ++ print_rh_warning(); + exit(exitcode); + } + +-- +2.21.0 + diff --git a/0015-globally-limit-the-maximum-number-of-CPUs.patch b/0015-globally-limit-the-maximum-number-of-CPUs.patch new file mode 100755 index 0000000..cec862d --- /dev/null +++ b/0015-globally-limit-the-maximum-number-of-CPUs.patch @@ -0,0 +1,152 @@ +From c9c3cf721b0e9e359418f64c2a5121c3f8b5d27a Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Tue, 21 Jan 2014 10:46:52 +0100 +Subject: globally limit the maximum number of CPUs + +We now globally limit the number of 
VCPUs. +Especially, there is no way one can specify more than +max_cpus VCPUs for a VM. + +This allows us to restore the ppc max_cpus limitation to the upstream +default and minimize the ppc hack in kvm-all.c. + +Signed-off-by: David Hildenbrand +Signed-off-by: Miroslav Rezanina +Signed-off-by: Danilo Cesar Lemes de Paula + +Rebase notes (2.11.0): +- Removed CONFIG_RHV reference +- Update commit log + +Merged patches (2.11.0): +- 92fef14623 redhat: remove manual max_cpus limitations for ppc +- bb722e9eff redhat: globally limit the maximum number of CPUs +- fdeef3c1c7 RHEL: Set vcpus hard limit to 240 for Power +- 0584216921 Match POWER max cpus to x86 + +Signed-off-by: Andrew Jones +(cherry picked from commit a4ceb63bdc5cbac19f5f633ec761b9de0dedb55e) +(cherry picked from commit a1f26d85171b4d554225150053700e93ba6eba10) + +redhat: globally limit the maximum number of CPUs + +RH-Author: David Hildenbrand +Message-id: <20180109103253.24517-2-david@redhat.com> +Patchwork-id: 78531 +O-Subject: [RHEL-7.5 qemu-kvm-ma PATCH v2 1/2] redhat: globally limit the maximum number of CPUs +Bugzilla: 1527449 +RH-Acked-by: David Gibson +RH-Acked-by: Thomas Huth +RH-Acked-by: Cornelia Huck + +Upstream-status: n/a + +For RHEL, we support 240, for RHV up to 384 VCPUs. Let's limit this +globally instead of fixing up all machines. This way, we can easily +change (increase) the product specific levels later.
+ +Signed-off-by: David Hildenbrand +Signed-off-by: Miroslav Rezanina + +redhat: remove manual max_cpus limitations for ppc + +RH-Author: David Hildenbrand +Message-id: <20180109103253.24517-3-david@redhat.com> +Patchwork-id: 78532 +O-Subject: [RHEL-7.5 qemu-kvm-ma PATCH v2 2/2] redhat: remove manual max_cpus limitations for ppc +Bugzilla: 1527449 +RH-Acked-by: David Gibson +RH-Acked-by: Thomas Huth +RH-Acked-by: Cornelia Huck + +Upstream-status: n/a + +RH-Author: Andrew Jones +Message-id: <1390301212-15344-1-git-send-email-drjones@redhat.com> +Patchwork-id: 56862 +O-Subject: [RHEL7.0 qemu-kvm PATCH v6] use recommended max vcpu count +Bugzilla: 998708 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Marcelo Tosatti + +The recommended vcpu max limit (KVM_CAP_NR_VCPUS) should be used instead +of the actual max vcpu limit (KVM_CAP_MAX_VCPUS) to give an error. + +This commit matches the limit to current KVM_CAP_NR_VCPUS value. + +Signed-off-by: Danilo C. L. de Paula +--- + accel/kvm/kvm-all.c | 12 ++++++++++++ + vl.c | 18 ++++++++++++++++++ + 2 files changed, 30 insertions(+) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index ca00daa2f5..dc3ed7f04e 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -1943,6 +1943,18 @@ static int kvm_init(MachineState *ms) + soft_vcpus_limit = kvm_recommended_vcpus(s); + hard_vcpus_limit = kvm_max_vcpus(s); + ++#ifdef HOST_PPC64 ++ /* ++ * On POWER, the kernel advertises a soft limit based on the ++ * number of CPU threads on the host. We want to allow exceeding ++ * this for testing purposes, so we don't want to set hard limit ++ * to soft limit as on x86. 
++ */ ++#else ++ /* RHEL doesn't support nr_vcpus > soft_vcpus_limit */ ++ hard_vcpus_limit = soft_vcpus_limit; ++#endif ++ + while (nc->name) { + if (nc->num > soft_vcpus_limit) { + warn_report("Number of %s cpus requested (%d) exceeds " +diff --git a/vl.c b/vl.c +index 9f3e7e7733..1550aa2aaa 100644 +--- a/vl.c ++++ b/vl.c +@@ -134,6 +134,8 @@ int main(int argc, char **argv) + + #define MAX_VIRTIO_CONSOLES 1 + ++#define RHEL_MAX_CPUS 384 ++ + static const char *data_dir[16]; + static int data_dir_idx; + const char *bios_name = NULL; +@@ -1339,6 +1341,20 @@ static MachineClass *find_default_machine(GSList *machines) + return NULL; + } + ++/* Maximum number of CPUs limited for Red Hat Enterprise Linux */ ++static void limit_max_cpus_in_machines(void) ++{ ++ GSList *el, *machines = object_class_get_list(TYPE_MACHINE, false); ++ ++ for (el = machines; el; el = el->next) { ++ MachineClass *mc = el->data; ++ ++ if (mc->max_cpus > RHEL_MAX_CPUS) { ++ mc->max_cpus = RHEL_MAX_CPUS; ++ } ++ } ++} ++ + static int machine_help_func(QemuOpts *opts, MachineState *machine) + { + ObjectProperty *prop; +@@ -3857,6 +3873,8 @@ int main(int argc, char **argv, char **envp) + "mutually exclusive"); + exit(EXIT_FAILURE); + } ++ /* Maximum number of CPUs limited for Red Hat Enterprise Linux */ ++ limit_max_cpus_in_machines(); + + configure_rtc(qemu_find_opts_singleton("rtc")); + +-- +2.21.0 + diff --git a/0016-Add-support-for-simpletrace.patch b/0016-Add-support-for-simpletrace.patch new file mode 100755 index 0000000..9624855 --- /dev/null +++ b/0016-Add-support-for-simpletrace.patch @@ -0,0 +1,121 @@ +From 26128b3ede339e292a3c50a84e3248af46ecd0ec Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Thu, 8 Oct 2015 09:50:17 +0200 +Subject: Add support for simpletrace + +As simpletrace is upstream, we just need to properly handle it during rpmbuild. 
+ +Signed-off-by: Miroslav Rezanina + +Rebase notes (3.1.0): +- Fixed python 2 to python3 switch + +Rebase notes (2.9.0): +- Added group argument for tracetool.py (upstream) + +Rebase notes (2.8.0): +- Changed tracetool.py parameters + +Merged patches (2.3.0): +- db959d6 redhat/qemu-kvm.spec.template: Install qemu-kvm-simpletrace.stp +- 5292fc3 trace: add SystemTap init scripts for simpletrace bridge +- eda9e5e simpletrace: install simpletrace.py +- 85c4c8f trace: add systemtap-initscript README file to RPM + +Signed-off-by: Danilo C. L. de Paula +--- + .gitignore | 2 ++ + Makefile | 4 +++ + README.systemtap | 43 +++++++++++++++++++++++++ + redhat/qemu-kvm.spec.template | 26 ++++++++++++++- + scripts/systemtap/conf.d/qemu_kvm.conf | 4 +++ + scripts/systemtap/script.d/qemu_kvm.stp | 1 + + 6 files changed, 79 insertions(+), 1 deletion(-) + create mode 100644 README.systemtap + create mode 100644 scripts/systemtap/conf.d/qemu_kvm.conf + create mode 100644 scripts/systemtap/script.d/qemu_kvm.stp + +diff --git a/Makefile b/Makefile +index 086727dbb9..4254950f7f 100644 +--- a/Makefile ++++ b/Makefile +@@ -939,6 +939,10 @@ endif + $(INSTALL_DATA) $(SRC_PATH)/pc-bios/keymaps/$$x "$(DESTDIR)$(qemu_datadir)/keymaps"; \ + done + $(INSTALL_DATA) $(BUILD_DIR)/trace-events-all "$(DESTDIR)$(qemu_datadir)/trace-events-all" ++ $(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)/systemtap/script.d" ++ $(INSTALL_DATA) $(SRC_PATH)/scripts/systemtap/script.d/qemu_kvm.stp "$(DESTDIR)$(qemu_datadir)/systemtap/script.d/" ++ $(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)/systemtap/conf.d" ++ $(INSTALL_DATA) $(SRC_PATH)/scripts/systemtap/conf.d/qemu_kvm.conf "$(DESTDIR)$(qemu_datadir)/systemtap/conf.d/" + + .PHONY: ctags + ctags: +diff --git a/README.systemtap b/README.systemtap +new file mode 100644 +index 0000000000..ad913fc990 +--- /dev/null ++++ b/README.systemtap +@@ -0,0 +1,43 @@ ++QEMU tracing using systemtap-initscript ++--------------------------------------- ++ ++You can capture QEMU trace data 
all the time using systemtap-initscript. This ++uses SystemTap's flight recorder mode to trace all running guests to a ++fixed-size buffer on the host. Old trace entries are overwritten by new ++entries when the buffer size wraps. ++ ++1. Install the systemtap-initscript package: ++ # yum install systemtap-initscript ++ ++2. Install the systemtap scripts and the conf file: ++ # cp /usr/share/qemu-kvm/systemtap/script.d/qemu_kvm.stp /etc/systemtap/script.d/ ++ # cp /usr/share/qemu-kvm/systemtap/conf.d/qemu_kvm.conf /etc/systemtap/conf.d/ ++ ++The set of trace events to enable is given in qemu_kvm.stp. This SystemTap ++script can be customized to add or remove trace events provided in ++/usr/share/systemtap/tapset/qemu-kvm-simpletrace.stp. ++ ++SystemTap customizations can be made to qemu_kvm.conf to control the flight ++recorder buffer size and whether to store traces in memory only or disk too. ++See stap(1) for option documentation. ++ ++3. Start the systemtap service. ++ # service systemtap start qemu_kvm ++ ++4. Make the service start at boot time. ++ # chkconfig systemtap on ++ ++5. Confirm that the service works. ++ # service systemtap status qemu_kvm ++ qemu_kvm is running... ++ ++When you want to inspect the trace buffer, perform the following steps: ++ ++1. Dump the trace buffer. ++ # staprun -A qemu_kvm >/tmp/trace.log ++ ++2. Start the systemtap service because the preceding step stops the service. ++ # service systemtap start qemu_kvm ++ ++3. Translate the trace record to readable format. 
++ # /usr/share/qemu-kvm/simpletrace.py --no-header /usr/share/qemu-kvm/trace-events /tmp/trace.log +diff --git a/scripts/systemtap/conf.d/qemu_kvm.conf b/scripts/systemtap/conf.d/qemu_kvm.conf +new file mode 100644 +index 0000000000..372d8160a4 +--- /dev/null ++++ b/scripts/systemtap/conf.d/qemu_kvm.conf +@@ -0,0 +1,4 @@ ++# Force load uprobes (see BZ#1118352) ++stap -e 'probe process("/usr/libexec/qemu-kvm").function("main") { printf("") }' -c true ++ ++qemu_kvm_OPT="-s4" # per-CPU buffer size, in megabytes +diff --git a/scripts/systemtap/script.d/qemu_kvm.stp b/scripts/systemtap/script.d/qemu_kvm.stp +new file mode 100644 +index 0000000000..c04abf9449 +--- /dev/null ++++ b/scripts/systemtap/script.d/qemu_kvm.stp +@@ -0,0 +1 @@ ++probe qemu.kvm.simpletrace.handle_qmp_command,qemu.kvm.simpletrace.monitor_protocol_*,qemu.kvm.simpletrace.migrate_set_state {} +-- +2.21.0 + diff --git a/0017-Use-qemu-kvm-in-documentation-instead-of-qemu-system.patch b/0017-Use-qemu-kvm-in-documentation-instead-of-qemu-system.patch new file mode 100755 index 0000000..ef83445 --- /dev/null +++ b/0017-Use-qemu-kvm-in-documentation-instead-of-qemu-system.patch @@ -0,0 +1,118 @@ +From 97ed62562b883c384346bfef3e1c7e379f03ccab Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 30 Nov 2018 09:11:03 +0100 +Subject: Use qemu-kvm in documentation instead of qemu-system- + +Patchwork-id: 62380 +O-Subject: [RHEV-7.1 qemu-kvm-rhev PATCHv4] Use qemu-kvm in documentation instead of qemu-system-i386 +Bugzilla: 1140620 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Markus Armbruster +RH-Acked-by: Stefan Hajnoczi + +From: Miroslav Rezanina + +We change the name and location of qemu-kvm binaries. Update documentation +to reflect this change. Only architectures available in RHEL are updated. + +Signed-off-by: Miroslav Rezanina +Signed-off-by: Danilo C. L. 
de Paula +--- + docs/qemu-block-drivers.texi | 2 +- + docs/qemu-cpu-models.texi | 2 +- + qemu-doc.texi | 6 +++--- + qemu-options.hx | 16 ++++++++-------- + 4 files changed, 13 insertions(+), 13 deletions(-) + +diff --git a/docs/qemu-block-drivers.texi b/docs/qemu-block-drivers.texi +index 2c7ea49c32..5d0afb3dee 100644 +--- a/docs/qemu-block-drivers.texi ++++ b/docs/qemu-block-drivers.texi +@@ -2,7 +2,7 @@ + QEMU block driver reference manual + @c man end + +-@set qemu_system qemu-system-x86_64 ++@set qemu_system qemu-kvm + + @c man begin DESCRIPTION + +diff --git a/docs/qemu-cpu-models.texi b/docs/qemu-cpu-models.texi +index f88a1def0d..c82cf8fab7 100644 +--- a/docs/qemu-cpu-models.texi ++++ b/docs/qemu-cpu-models.texi +@@ -2,7 +2,7 @@ + QEMU / KVM CPU model configuration + @c man end + +-@set qemu_system_x86 qemu-system-x86_64 ++@set qemu_system_x86 qemu-kvm + + @c man begin DESCRIPTION + +diff --git a/qemu-doc.texi b/qemu-doc.texi +index 3ddf5c0a68..d460f8d2c0 100644 +--- a/qemu-doc.texi ++++ b/qemu-doc.texi +@@ -11,8 +11,8 @@ + @paragraphindent 0 + @c %**end of header + +-@set qemu_system qemu-system-x86_64 +-@set qemu_system_x86 qemu-system-x86_64 ++@set qemu_system qemu-kvm ++@set qemu_system_x86 qemu-kvm + + @ifinfo + @direntry +@@ -1827,7 +1827,7 @@ Set the initial VGA graphic mode. The default is 800x600x32. + Set OpenBIOS variables in NVRAM, for example: + + @example +-qemu-system-ppc -prom-env 'auto-boot?=false' \ ++qemu-kvm -prom-env 'auto-boot?=false' \ + -prom-env 'boot-device=hd:2,\yaboot' \ + -prom-env 'boot-args=conf=hd:2,\yaboot.conf' + @end example +diff --git a/qemu-options.hx b/qemu-options.hx +index fc17aca631..df1d27b6f2 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -2737,11 +2737,11 @@ be created for multiqueue vhost-user. 
+ + Example: + @example +-qemu -m 512 -object memory-backend-file,id=mem,size=512M,mem-path=/hugetlbfs,share=on \ +- -numa node,memdev=mem \ +- -chardev socket,id=chr0,path=/path/to/socket \ +- -netdev type=vhost-user,id=net0,chardev=chr0 \ +- -device virtio-net-pci,netdev=net0 ++qemu-kvm -m 512 -object memory-backend-file,id=mem,size=512M,mem-path=/hugetlbfs,share=on \ ++ -numa node,memdev=mem \ ++ -chardev socket,id=chr0,path=/path/to/socket \ ++ -netdev type=vhost-user,id=net0,chardev=chr0 \ ++ -device virtio-net-pci,netdev=net0 + @end example + + @item -netdev hubport,id=@var{id},hubid=@var{hubid}[,netdev=@var{nd}] +@@ -3631,14 +3631,14 @@ ETEXI + + DEF("realtime", HAS_ARG, QEMU_OPTION_realtime, + "-realtime [mlock=on|off]\n" +- " run qemu with realtime features\n" ++ " run qemu-kvm with realtime features\n" + " mlock=on|off controls mlock support (default: on)\n", + QEMU_ARCH_ALL) + STEXI + @item -realtime mlock=on|off + @findex -realtime +-Run qemu with realtime features. +-mlocking qemu and guest memory can be enabled via @option{mlock=on} ++Run qemu-kvm with realtime features. ++mlocking qemu-kvm and guest memory can be enabled via @option{mlock=on} + (enabled by default). + ETEXI + +-- +2.21.0 + diff --git a/0018-usb-xhci-Fix-PCI-capability-order.patch b/0018-usb-xhci-Fix-PCI-capability-order.patch new file mode 100755 index 0000000..bc6146d --- /dev/null +++ b/0018-usb-xhci-Fix-PCI-capability-order.patch @@ -0,0 +1,96 @@ +From b13a7d3527c5c91e7a50236de30a2244b8453911 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Fri, 5 May 2017 19:06:14 +0200 +Subject: usb-xhci: Fix PCI capability order + +RH-Author: Dr. David Alan Gilbert +Message-id: <20170505190614.15987-2-dgilbert@redhat.com> +Patchwork-id: 75038 +O-Subject: [RHEL-7.4 qemu-kvm-rhev PATCH 1/1] usb-xhci: Fix PCI capability order +Bugzilla: 1447874 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Gerd Hoffmann +RH-Acked-by: Juan Quintela + +From: "Dr. 
David Alan Gilbert" + +Upstream commit 1108b2f8a9 in 2.7.0 changed the order +of the PCI capability chain in the XHCI pci device in the case +where the device has the PCIe endpoint capability (i.e. only +older machine types, pc-i440fx-2.0 upstream, pc-i440fx-rhel7.0.0 +apparently for us). + +Changing the order breaks migration compatibility; fixing this +upstream would mean breaking the same case going from 2.7.0->current +that currently works 2.7.0->2.9.0 - so upstream it's a choice +of two breakages. + +Since we never released 2.7.0/2.8.0 we can fix this downstream. + +This reverts the order so that we create the capabilities in the +order: + PCIe + MSI + MSI-X + +The symptom is: +qemu-kvm: get_pci_config_device: Bad config data: i=0x71 read: a0 device: 0 cmask: ff wmask: 0 w1cmask:0 +qemu-kvm: Failed to load PCIDevice:config +qemu-kvm: Failed to load xhci:parent_obj +qemu-kvm: error while loading state for instance 0x0 of device '0000:00:0d.0/xhci' +qemu-kvm: load of migration failed: Invalid argument + +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Miroslav Rezanina + +-- +Rebase notes (2.9.0): +- Change in assert condition (upstream) + +(cherry picked from commit aad727a5ecde1ad4935eb8427604d4df5a1f1f35) +(cherry picked from commit 2dd7402227e77d748a7375233ac9e7feab244bda) + +Conflicts: + hw/usb/hcd-xhci.c + +(cherry picked from commit a42f86dc906cc7d2c16d02bf125ed76847b469cb) +(cherry picked from commit 992ab2e4f6e15d3e51bc716763aa8d6f45c6d29d) +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/usb/hcd-xhci.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index 8fed2eedd6..d2b9744030 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -3403,6 +3403,12 @@ static void usb_xhci_realize(struct PCIDevice *dev, Error **errp) + xhci->max_pstreams_mask = 0; + } + ++ if (pci_bus_is_express(pci_get_bus(dev)) || ++ xhci_get_flag(xhci, XHCI_FLAG_FORCE_PCIE_ENDCAP)) { ++ ret = pcie_endpoint_cap_init(dev, 0xa0); ++ assert(ret > 0); ++ } ++ + if (xhci->msi != ON_OFF_AUTO_OFF) { + ret = msi_init(dev, 0x70, xhci->numintrs, true, false, &err); + /* Any error other than -ENOTSUP(board's MSI support is broken) +@@ -3451,12 +3457,6 @@ static void usb_xhci_realize(struct PCIDevice *dev, Error **errp) + PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64, + &xhci->mem); + +- if (pci_bus_is_express(pci_get_bus(dev)) || +- xhci_get_flag(xhci, XHCI_FLAG_FORCE_PCIE_ENDCAP)) { +- ret = pcie_endpoint_cap_init(dev, 0xa0); +- assert(ret > 0); +- } +- + if (xhci->msix != ON_OFF_AUTO_OFF) { + /* TODO check for errors, and should fail when msix=on */ + msix_init(dev, xhci->numintrs, +-- +2.21.0 + diff --git a/0019-virtio-scsi-Reject-scsi-cd-if-data-plane-enabled-RHE.patch b/0019-virtio-scsi-Reject-scsi-cd-if-data-plane-enabled-RHE.patch new file mode 100755 index 0000000..e167b2e --- /dev/null +++ b/0019-virtio-scsi-Reject-scsi-cd-if-data-plane-enabled-RHE.patch @@ -0,0 +1,69 @@ +From 3fab8f5e8a9e190c1ed6916ac13c7c4d65e874b7 Mon Sep 17 00:00:00 2001 +From: Fam Zheng +Date: Wed, 14 Jun 2017 15:37:01 +0200 +Subject: virtio-scsi: Reject scsi-cd if data plane enabled [RHEL only] + +RH-Author: Fam Zheng +Message-id: <20170614153701.14757-1-famz@redhat.com> +Patchwork-id: 75613 +O-Subject: [RHV-7.4 qemu-kvm-rhev PATCH v3] virtio-scsi: Reject scsi-cd if data plane enabled [RHEL only] +Bugzilla: 1378816 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + 
+We need a fix for RHEL 7.4 and 7.3.z, but unfortunately upstream isn't +ready. If it were, the changes will be too invasive. To have an idea: + +https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg05400.html + +is an incomplete attempt to fix part of the issue, and the remaining +work unfortunately involve even more complex changes. + +As a band-aid, this partially reverts the effect of ef8875b +(virtio-scsi: Remove op blocker for dataplane, since v2.7). We cannot +simply revert that commit as a whole because we already shipped it in +qemu-kvm-rhev 7.3, since when, block jobs has been possible. We should +only block what has been broken. Also, faithfully reverting the above +commit means adding back the removed op blocker, but that is not enough, +because it still crashes when inserting media into an initially empty +scsi-cd. + +All in all, scsi-cd on virtio-scsi-dataplane has basically been unusable +unless the scsi-cd never enters an empty state, so, disable it +altogether. Otherwise it would be much more difficult to avoid +crashing. + +Signed-off-by: Fam Zheng +Signed-off-by: Miroslav Rezanina +(cherry picked from commit b0caf00bbc35c7d89e02999bdce86e1f867728e8) +(cherry picked from commit c9c4f117d8b507c2f86035c282d537c0a327364f) +(cherry picked from commit 5d586bb2543337f0ff172c6ce942dba3acbcedff) +Signed-off-by: Danilo C. L. de Paula +--- + hw/scsi/virtio-scsi.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c +index e8b2b64d09..54108c0056 100644 +--- a/hw/scsi/virtio-scsi.c ++++ b/hw/scsi/virtio-scsi.c +@@ -808,6 +808,15 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev, + SCSIDevice *sd = SCSI_DEVICE(dev); + int ret; + ++ /* XXX: Remove this check once block backend is capable of handling ++ * AioContext change upon eject/insert. 
++ * s->ctx is NULL if ioeventfd is off, s->ctx is qemu_get_aio_context() if ++ * data plane is not used, both cases are safe for scsi-cd. */ ++ if (s->ctx && s->ctx != qemu_get_aio_context() && ++ object_dynamic_cast(OBJECT(dev), "scsi-cd")) { ++ error_setg(errp, "scsi-cd is not supported by data plane"); ++ return; ++ } + if (s->ctx && !s->dataplane_fenced) { + if (blk_op_is_blocked(sd->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) { + return; +-- +2.21.0 + diff --git a/0020-BZ1653590-Require-at-least-64kiB-pages-for-downstrea.patch b/0020-BZ1653590-Require-at-least-64kiB-pages-for-downstrea.patch new file mode 100755 index 0000000..b3350da --- /dev/null +++ b/0020-BZ1653590-Require-at-least-64kiB-pages-for-downstrea.patch @@ -0,0 +1,60 @@ +From 148e9e80a3a430615b552075082fad22d007d851 Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Wed, 6 Feb 2019 03:58:56 +0000 +Subject: BZ1653590: Require at least 64kiB pages for downstream guests & hosts + +RH-Author: David Gibson +Message-id: <20190206035856.19058-1-dgibson@redhat.com> +Patchwork-id: 84246 +O-Subject: [RHELAV-8.0/rhel qemu-kvm PATCH] BZ1653590: Require at least 64kiB pages for downstream guests & hosts +Bugzilla: 1653590 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Serhii Popovych +RH-Acked-by: Thomas Huth + +Most current POWER guests require 64kiB page support, so that's the default +for the cap-hpt-max-pagesize option in qemu which limits available guest +page sizes. We warn if the value is set smaller than that, but don't +outright fail upstream, because we need to allow for the possibility of +guest (and/or host) kernels configured for 4kiB page sizes. + +Downstream, however, we simply don't support 4kiB pagesize configured +kernels in guest or host, so we can have qemu simply error out in this +situation. + +Testing: Attempted to start a guest with cap-hpt-max-page-size=4k and verified + it failed immediately with a qemu error + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/ppc/spapr_caps.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c +index 481dfd2a27..805f38533e 100644 +--- a/hw/ppc/spapr_caps.c ++++ b/hw/ppc/spapr_caps.c +@@ -351,12 +351,19 @@ void spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, + static void cap_hpt_maxpagesize_apply(SpaprMachineState *spapr, + uint8_t val, Error **errp) + { ++#if 0 /* disabled for RHEL */ + if (val < 12) { + error_setg(errp, "Require at least 4kiB hpt-max-page-size"); + return; + } else if (val < 16) { + warn_report("Many guests require at least 64kiB hpt-max-page-size"); + } ++#else /* Only page sizes >=64kiB supported for RHEL */ ++ if (val < 16) { ++ error_setg(errp, "Require at least 64kiB hpt-max-page-size"); ++ return; ++ } ++#endif + + spapr_check_pagesize(spapr, qemu_minrampagesize(), errp); + } +-- +2.21.0 + diff --git a/0021-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch b/0021-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch new file mode 100755 index 0000000..a2a800b --- /dev/null +++ b/0021-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch @@ -0,0 +1,61 @@ +From ab9ebc29bb9bb142e73a160750a451d40bfe9746 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Mon, 16 Sep 2019 17:07:00 +0100 +Subject: Using ip_deq after m_free might read pointers from an allocation + reuse. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Philippe Mathieu-Daudé +Message-id: <20190916170700.647-2-philmd@redhat.com> +Patchwork-id: 90470 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm PATCH 1/1] Using ip_deq after m_free might read pointers from an allocation reuse. 
+Bugzilla: 1749737 +RH-Acked-by: Danilo de Paula +RH-Acked-by: John Snow + +From: Samuel Thibault + +This would be difficult to exploit, but that is still related with +CVE-2019-14378 which generates fragmented IP packets that would trigger this +issue and at least produce a DoS. + +Signed-off-by: Samuel Thibault +(cherry picked from libslirp commit c59279437eda91841b9d26079c70b8a540d41204) +Signed-off-by: Philippe Mathieu-Daudé + +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/ip_input.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/slirp/src/ip_input.c b/slirp/src/ip_input.c +index 8c75d91495..df1c846ade 100644 +--- a/slirp/src/ip_input.c ++++ b/slirp/src/ip_input.c +@@ -292,6 +292,7 @@ static struct ip *ip_reass(Slirp *slirp, struct ip *ip, struct ipq *fp) + */ + while (q != (struct ipasfrag *)&fp->frag_link && + ip->ip_off + ip->ip_len > q->ipf_off) { ++ struct ipasfrag *prev; + i = (ip->ip_off + ip->ip_len) - q->ipf_off; + if (i < q->ipf_len) { + q->ipf_len -= i; +@@ -299,9 +300,11 @@ static struct ip *ip_reass(Slirp *slirp, struct ip *ip, struct ipq *fp) + m_adj(dtom(slirp, q), i); + break; + } ++ prev = q; + q = q->ipf_next; +- m_free(dtom(slirp, q->ipf_prev)); +- ip_deq(q->ipf_prev); ++ ip_deq(prev); ++ m_free(dtom(slirp, prev)); ++ + } + + insert: +-- +2.21.0 + diff --git a/81-kvm-rhel.rules b/81-kvm-rhel.rules new file mode 100755 index 0000000..787cad6 --- /dev/null +++ b/81-kvm-rhel.rules @@ -0,0 +1 @@ +DEVPATH=="*/kvm", ACTION=="change", RUN+="/lib/udev/udev-kvm-check $env{COUNT} $env{EVENT}" diff --git a/85-kvm.preset b/85-kvm.preset new file mode 100755 index 0000000..8024052 --- /dev/null +++ b/85-kvm.preset @@ -0,0 +1,5 @@ +# Enable kvm-setup by default. This can have odd side effects on +# PowerNV systems that aren't intended as KVM hosts, but at present we +# only support RHEL on PowerNV for the purpose of being a RHEV host. 
+ +enable kvm-setup.service diff --git a/95-kvm-memlock.conf b/95-kvm-memlock.conf new file mode 100755 index 0000000..fc59dbe --- /dev/null +++ b/95-kvm-memlock.conf @@ -0,0 +1,10 @@ +# The KVM HV implementation on Power can require a significant amount +# of unswappable memory (about half of which also needs to be host +# physically contiguous) to hold the guest's Hash Page Table (HPT) - +# roughly 1/64th of the guest's RAM size, minimum 16MiB. +# +# These limits allow unprivileged users to start smallish VMs, such as +# those used by libguestfs. +# +* hard memlock 65536 +* soft memlock 65536 diff --git a/99-qemu-guest-agent.rules b/99-qemu-guest-agent.rules new file mode 100755 index 0000000..8a290ab --- /dev/null +++ b/99-qemu-guest-agent.rules @@ -0,0 +1,2 @@ +SUBSYSTEM=="virtio-ports", ATTR{name}=="org.qemu.guest_agent.0", \ + TAG+="systemd" ENV{SYSTEMD_WANTS}="qemu-guest-agent.service" diff --git a/README.md b/README.md deleted file mode 100644 index 7342728..0000000 --- a/README.md +++ /dev/null @@ -1,11 +0,0 @@ -Anolis OS -======================================= -# 代码仓库说明 -## 分支说明 ->进行代码开发工作时,请注意选择当前版本对应的分支 -* aX分支为对应大版本的主分支,如a8分支对应当前最新版本 -* aX.Y分支为对应小版本的维护分支,如a8.2分支对应8.2版本 -## 开发流程 -1. 首先fork目标分支到自己的namespace -2. 在自己的fork分支上做出修改 -3. 
向对应的仓库中提交merge request,源分支为fork分支 diff --git a/README.tests b/README.tests new file mode 100755 index 0000000..9932773 --- /dev/null +++ b/README.tests @@ -0,0 +1,39 @@ +qemu-kvm-tests README +===================== + +The qemu-kvm-tests rpm contains tests that can be used to verify the +functionality of the installed qemu-kvm package + +When installed, the files from this rpm will be arranged in the following +directory structure + +tests-src/ +├── README +├── scripts +│   ├── qemu.py +│   └── qmp +└── tests + ├── acceptance + ├── Makefile.include + └── qemu-iotests + +The tests/ directory within the tests-src/ directory is setup to remain a copy +of a subset of the tests/ directory from the QEMU source tree + +The avocado_qemu tests and qemu-iotests, along with files required for the +execution of the avocado_qemu tests (scripts/qemu.py and scripts/qmp/) will be +installed in a new location - /usr/lib64/qemu-kvm/tests-src/ + +avocado_qemu tests: +The avocado_qemu tests can be executed by running the following avocado command: +avocado run -p qemu_bin=/usr/libexec/qemu-kvm /usr/lib64/qemu-kvm/tests/acceptance/ +Avocado needs to be installed separately using either pip or from source as +Avocado is not being packaged for RHEL-8. + +qemu-iotests: +symlinks to corresponding binaries need to be created for QEMU_PROG, +QEMU_IO_PROG, QEMU_IMG_PROG, and QEMU_NBD_PROG before the iotests can be +executed. + +The primary purpose of this package is to make these tests available to be +executed as gating tests for the virt module in the RHEL-8 OSCI environment. 
diff --git a/bridge.conf b/bridge.conf new file mode 100755 index 0000000..a573665 --- /dev/null +++ b/bridge.conf @@ -0,0 +1 @@ +allow virbr0 diff --git a/ksm.service b/ksm.service new file mode 100755 index 0000000..35c6f1d --- /dev/null +++ b/ksm.service @@ -0,0 +1,13 @@ +[Unit] +Description=Kernel Samepage Merging +ConditionPathExists=/sys/kernel/mm/ksm + +[Service] +Type=oneshot +RemainAfterExit=yes +EnvironmentFile=-/etc/sysconfig/ksm +ExecStart=/usr/libexec/ksmctl start +ExecStop=/usr/libexec/ksmctl stop + +[Install] +WantedBy=multi-user.target diff --git a/ksm.sysconfig b/ksm.sysconfig new file mode 100755 index 0000000..d99656d --- /dev/null +++ b/ksm.sysconfig @@ -0,0 +1,4 @@ +# The maximum number of unswappable kernel pages +# which may be allocated by ksm (0 for unlimited) +# If unset, defaults to half of total memory +# KSM_MAX_KERNEL_PAGES= diff --git a/ksmctl.c b/ksmctl.c new file mode 100755 index 0000000..af39591 --- /dev/null +++ b/ksmctl.c @@ -0,0 +1,77 @@ +/* Start/stop KSM, for systemd. + * Copyright (C) 2009, 2011 Red Hat, Inc. + * Written by Paolo Bonzini . + * Based on the original sysvinit script by Dan Kenigsberg + * This file is distributed under the GNU General Public License, version 2 + * or later. 
*/ + +#include +#include +#include +#include +#include +#include + +#define KSM_MAX_KERNEL_PAGES_FILE "/sys/kernel/mm/ksm/max_kernel_pages" +#define KSM_RUN_FILE "/sys/kernel/mm/ksm/run" + +char *program_name; + +int usage(void) +{ + fprintf(stderr, "Usage: %s {start|stop}\n", program_name); + return 1; +} + +int write_value(uint64_t value, char *filename) +{ + FILE *fp; + if (!(fp = fopen(filename, "w")) || + fprintf(fp, "%llu\n", (unsigned long long) value) == EOF || + fflush(fp) == EOF || + fclose(fp) == EOF) + return 1; + + return 0; +} + +uint64_t ksm_max_kernel_pages() +{ + char *var = getenv("KSM_MAX_KERNEL_PAGES"); + char *endptr; + uint64_t value; + if (var && *var) { + value = strtoll(var, &endptr, 0); + if (value < LLONG_MAX && !*endptr) + return value; + } + /* Unless KSM_MAX_KERNEL_PAGES is set, let KSM munch up to half of + * total memory. */ + return sysconf(_SC_PHYS_PAGES) / 2; +} + +int start(void) +{ + if (access(KSM_MAX_KERNEL_PAGES_FILE, R_OK) >= 0) + write_value(ksm_max_kernel_pages(), KSM_MAX_KERNEL_PAGES_FILE); + return write_value(1, KSM_RUN_FILE); +} + +int stop(void) +{ + return write_value(0, KSM_RUN_FILE); +} + +int main(int argc, char **argv) +{ + program_name = argv[0]; + if (argc < 2) { + return usage(); + } else if (!strcmp(argv[1], "start")) { + return start(); + } else if (!strcmp(argv[1], "stop")) { + return stop(); + } else { + return usage(); + } +} diff --git a/ksmtuned b/ksmtuned new file mode 100755 index 0000000..7bc5743 --- /dev/null +++ b/ksmtuned @@ -0,0 +1,139 @@ +#!/bin/bash +# +# Copyright 2009 Red Hat, Inc. and/or its affiliates. +# Released under the GPL +# +# Author: Dan Kenigsberg +# +# ksmtuned - a simple script that controls whether (and with what vigor) ksm +# should search for duplicated pages. +# +# starts ksm when memory commited to qemu processes exceeds a threshold, and +# make ksm work harder and harder untill memory load falls below that +# threshold. 
+# +# send SIGUSR1 to this process right after a new qemu process is started, or +# following its death, to retune ksm accordingly +# +# needs testing and ironing. contact danken@redhat.com if something breaks. + +if [ -f /etc/ksmtuned.conf ]; then + . /etc/ksmtuned.conf +fi + +debug() { + if [ -n "$DEBUG" ]; then + s="`/bin/date`: $*" + [ -n "$LOGFILE" ] && echo "$s" >> "$LOGFILE" || echo "$s" + fi +} + + +KSM_MONITOR_INTERVAL=${KSM_MONITOR_INTERVAL:-60} +KSM_NPAGES_BOOST=${KSM_NPAGES_BOOST:-300} +KSM_NPAGES_DECAY=${KSM_NPAGES_DECAY:--50} + +KSM_NPAGES_MIN=${KSM_NPAGES_MIN:-64} +KSM_NPAGES_MAX=${KSM_NPAGES_MAX:-1250} +# millisecond sleep between ksm scans for 16Gb server. Smaller servers sleep +# more, bigger sleep less. +KSM_SLEEP_MSEC=${KSM_SLEEP_MSEC:-10} + +KSM_THRES_COEF=${KSM_THRES_COEF:-20} +KSM_THRES_CONST=${KSM_THRES_CONST:-2048} + +total=`awk '/^MemTotal:/ {print $2}' /proc/meminfo` +debug total $total + +npages=0 +sleep=$[KSM_SLEEP_MSEC * 16 * 1024 * 1024 / total] +[ $sleep -le 10 ] && sleep=10 +debug sleep $sleep +thres=$[total * KSM_THRES_COEF / 100] +if [ $KSM_THRES_CONST -gt $thres ]; then + thres=$KSM_THRES_CONST +fi +debug thres $thres + +KSMCTL () { + case x$1 in + xstop) + echo 0 > /sys/kernel/mm/ksm/run + ;; + xstart) + echo $2 > /sys/kernel/mm/ksm/pages_to_scan + echo $3 > /sys/kernel/mm/ksm/sleep_millisecs + echo 1 > /sys/kernel/mm/ksm/run + ;; + esac +} + +committed_memory () { + # calculate how much memory is committed to running qemu processes + local pidlist + pidlist=$(pgrep -d ' ' -- '^qemu(-(kvm|system-.+)|:.{1,11})$') + if [ -n "$pidlist" ]; then + ps -p "$pidlist" -o rsz= + fi | awk '{ sum += $1 }; END { print 0+sum }' +} + +free_memory () { + awk '/^(MemFree|Buffers|Cached):/ {free += $2}; END {print free}' \ + /proc/meminfo +} + +increase_npages() { + local delta + delta=${1:-0} + npages=$[npages + delta] + if [ $npages -lt $KSM_NPAGES_MIN ]; then + npages=$KSM_NPAGES_MIN + elif [ $npages -gt $KSM_NPAGES_MAX ]; then + 
npages=$KSM_NPAGES_MAX + fi + echo $npages +} + + +adjust () { + local free committed + free=`free_memory` + committed=`committed_memory` + debug committed $committed free $free + if [ $[committed + thres] -lt $total -a $free -gt $thres ]; then + KSMCTL stop + debug "$[committed + thres] < $total and free > $thres, stop ksm" + return 1 + fi + debug "$[committed + thres] > $total, start ksm" + if [ $free -lt $thres ]; then + npages=`increase_npages $KSM_NPAGES_BOOST` + debug "$free < $thres, boost" + else + npages=`increase_npages $KSM_NPAGES_DECAY` + debug "$free > $thres, decay" + fi + KSMCTL start $npages $sleep + debug "KSMCTL start $npages $sleep" + return 0 +} + +function nothing () { + : +} + +loop () { + trap nothing SIGUSR1 + while true + do + sleep $KSM_MONITOR_INTERVAL & + wait $! + adjust + done +} + +PIDFILE=${PIDFILE-/var/run/ksmtune.pid} +if touch "$PIDFILE"; then + loop & + echo $! > "$PIDFILE" +fi diff --git a/ksmtuned.conf b/ksmtuned.conf new file mode 100755 index 0000000..fc4518c --- /dev/null +++ b/ksmtuned.conf @@ -0,0 +1,21 @@ +# Configuration file for ksmtuned. + +# How long ksmtuned should sleep between tuning adjustments +# KSM_MONITOR_INTERVAL=60 + +# Millisecond sleep between ksm scans for 16Gb server. +# Smaller servers sleep more, bigger sleep less. 
+# KSM_SLEEP_MSEC=10 + +# KSM_NPAGES_BOOST=300 +# KSM_NPAGES_DECAY=-50 +# KSM_NPAGES_MIN=64 +# KSM_NPAGES_MAX=1250 + +# KSM_THRES_COEF=20 +# KSM_THRES_CONST=2048 + +# uncomment the following if you want ksmtuned debug info + +# LOGFILE=/var/log/ksmtuned +# DEBUG=1 diff --git a/ksmtuned.service b/ksmtuned.service new file mode 100755 index 0000000..39febcc --- /dev/null +++ b/ksmtuned.service @@ -0,0 +1,12 @@ +[Unit] +Description=Kernel Samepage Merging (KSM) Tuning Daemon +After=ksm.service +Requires=ksm.service + +[Service] +ExecStart=/usr/sbin/ksmtuned +ExecReload=/bin/kill -USR1 $MAINPID +Type=forking + +[Install] +WantedBy=multi-user.target diff --git a/kvm-ACPI-add-expected-files-for-HMAT-tests-acpihmat.patch b/kvm-ACPI-add-expected-files-for-HMAT-tests-acpihmat.patch new file mode 100755 index 0000000..7310f17 --- /dev/null +++ b/kvm-ACPI-add-expected-files-for-HMAT-tests-acpihmat.patch @@ -0,0 +1,41 @@ +From ff8529dcbf86b3a086d64dd630cf6a687603c571 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:55 +0100 +Subject: [PATCH 12/12] ACPI: add expected files for HMAT tests (acpihmat) + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-12-plai@redhat.com> +Patchwork-id: 96742 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 11/11] ACPI: add expected files for HMAT tests (acpihmat) +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: "Michael S. Tsirkin" + +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 48892c6c8def6624a0ed57e2bd6c2a0a9878b973) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/bios-tables-test-allowed-diff.h | 8 -------- + 1 file changed, 8 deletions(-) + +diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h +index 3c9e0c9..dfb8523 100644 +--- a/tests/bios-tables-test-allowed-diff.h ++++ b/tests/bios-tables-test-allowed-diff.h +@@ -1,9 +1 @@ + /* List of comma-separated changed AML files to ignore */ +-"tests/data/acpi/pc/APIC.acpihmat", +-"tests/data/acpi/pc/SRAT.acpihmat", +-"tests/data/acpi/pc/HMAT.acpihmat", +-"tests/data/acpi/pc/DSDT.acpihmat", +-"tests/data/acpi/q35/APIC.acpihmat", +-"tests/data/acpi/q35/SRAT.acpihmat", +-"tests/data/acpi/q35/HMAT.acpihmat", +-"tests/data/acpi/q35/DSDT.acpihmat", +-- +1.8.3.1 + diff --git a/kvm-Add-mtod_check.patch b/kvm-Add-mtod_check.patch new file mode 100755 index 0000000..0b2e710 --- /dev/null +++ b/kvm-Add-mtod_check.patch @@ -0,0 +1,68 @@ +From 52bf635da30c75d0fdb0a3e7e7b9a2483ca033fc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:59 -0400 +Subject: [PATCH 05/14] Add mtod_check() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-2-marcandre.lureau@redhat.com> +Patchwork-id: 101819 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/8] Add mtod_check() +Bugzilla: 1970819 1970835 1970843 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Recent security issues demonstrate the lack of safety care when casting +a mbuf to a particular structure type. At least, it should check that +the buffer is large enough. The following patches will make use of this +function. 
+ +Signed-off-by: Marc-André Lureau + +(cherry picked from commit 93e645e72a056ec0b2c16e0299fc5c6b94e4ca17) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/mbuf.c | 11 +++++++++++ + slirp/src/mbuf.h | 1 + + 2 files changed, 12 insertions(+) + +diff --git a/slirp/src/mbuf.c b/slirp/src/mbuf.c +index 4fd62282a9..6d0653ed3d 100644 +--- a/slirp/src/mbuf.c ++++ b/slirp/src/mbuf.c +@@ -222,3 +222,14 @@ struct mbuf *dtom(Slirp *slirp, void *dat) + + return (struct mbuf *)0; + } ++ ++void *mtod_check(struct mbuf *m, size_t len) ++{ ++ if (m->m_len >= len) { ++ return m->m_data; ++ } ++ ++ DEBUG_ERROR("mtod failed"); ++ ++ return NULL; ++} +diff --git a/slirp/src/mbuf.h b/slirp/src/mbuf.h +index 546e7852c5..2015e3232f 100644 +--- a/slirp/src/mbuf.h ++++ b/slirp/src/mbuf.h +@@ -118,6 +118,7 @@ void m_inc(struct mbuf *, int); + void m_adj(struct mbuf *, int); + int m_copy(struct mbuf *, struct mbuf *, int, int); + struct mbuf *dtom(Slirp *, void *); ++void *mtod_check(struct mbuf *, size_t len); + + static inline void ifs_init(struct mbuf *ifm) + { +-- +2.27.0 + diff --git a/kvm-Compress-lines-for-immediate-return.patch b/kvm-Compress-lines-for-immediate-return.patch new file mode 100755 index 0000000..aed5149 --- /dev/null +++ b/kvm-Compress-lines-for-immediate-return.patch @@ -0,0 +1,242 @@ +From 5cf6dd33456c4e7e2a8849f458ce234fb5bb290c Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 25 Jun 2021 17:41:03 -0400 +Subject: [PATCH 3/4] Compress lines for immediate return +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Kevin Wolf +Message-id: <20210625174104.44313-2-kwolf@redhat.com> +Patchwork-id: 101777 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] Compress lines for immediate return +Bugzilla: 1970912 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Simran Singhal + +Compress two lines into a single line if immediate return 
statement is found. + +It also remove variables progress, val, data, ret and sock +as they are no longer needed. + +Remove space between function "mixer_load" and '(' to fix the +checkpatch.pl error:- +ERROR: space prohibited between function name and open parenthesis '(' + +Done using following coccinelle script: +@@ +local idexpression ret; +expression e; +@@ + +-ret = ++return + e; +-return ret; + +Signed-off-by: Simran Singhal +Reviewed-by: Stefan Hajnoczi +Message-Id: <20200401165314.GA3213@simran-Inspiron-5558> +[lv: in handle_aiocb_write_zeroes_unmap() move "int ret" inside the #ifdef] +Signed-off-by: Laurent Vivier +(cherry picked from commit b3ac2b94cdc939a90d5a22338ae507689e2cfab0) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 8 +++----- + block/nfs.c | 3 +-- + block/nvme.c | 4 +--- + block/vhdx.c | 3 +-- + hw/audio/ac97.c | 4 +--- + hw/audio/adlib.c | 5 +---- + hw/display/cirrus_vga.c | 4 +--- + migration/ram.c | 4 +--- + ui/gtk.c | 3 +-- + util/qemu-sockets.c | 5 +---- + 10 files changed, 12 insertions(+), 31 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 371572f1b0..837edcf027 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -1626,13 +1626,12 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque) + { + RawPosixAIOData *aiocb = opaque; + BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque; +- int ret; + + /* First try to write zeros and unmap at the same time */ + + #ifdef CONFIG_FALLOCATE_PUNCH_HOLE +- ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, +- aiocb->aio_offset, aiocb->aio_nbytes); ++ int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, ++ aiocb->aio_offset, aiocb->aio_nbytes); + if (ret != -ENOTSUP) { + return ret; + } +@@ -1640,8 +1639,7 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque) + + /* If we couldn't manage to unmap while guaranteed that the area reads as + * all-zero afterwards, just write 
zeroes without unmapping */ +- ret = handle_aiocb_write_zeroes(aiocb); +- return ret; ++ return handle_aiocb_write_zeroes(aiocb); + } + + #ifndef HAVE_COPY_FILE_RANGE +diff --git a/block/nfs.c b/block/nfs.c +index 2393fbfe6b..18c0a73694 100644 +--- a/block/nfs.c ++++ b/block/nfs.c +@@ -623,8 +623,7 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, + } + + bs->total_sectors = ret; +- ret = 0; +- return ret; ++ return 0; + } + + static QemuOptsList nfs_create_opts = { +diff --git a/block/nvme.c b/block/nvme.c +index 7b7c0cc5d6..eb2f54dd9d 100644 +--- a/block/nvme.c ++++ b/block/nvme.c +@@ -575,11 +575,9 @@ static bool nvme_poll_cb(void *opaque) + { + EventNotifier *e = opaque; + BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier); +- bool progress = false; + + trace_nvme_poll_cb(s); +- progress = nvme_poll_queues(s); +- return progress; ++ return nvme_poll_queues(s); + } + + static int nvme_init(BlockDriverState *bs, const char *device, int namespace, +diff --git a/block/vhdx.c b/block/vhdx.c +index 21497f7318..a427e47f10 100644 +--- a/block/vhdx.c ++++ b/block/vhdx.c +@@ -411,8 +411,7 @@ int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, + if (ret < 0) { + return ret; + } +- ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); +- return ret; ++ return vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + } + + /* opens the specified header block from the VHDX file header section */ +diff --git a/hw/audio/ac97.c b/hw/audio/ac97.c +index a136b97f68..a2cfae52b3 100644 +--- a/hw/audio/ac97.c ++++ b/hw/audio/ac97.c +@@ -574,11 +574,9 @@ static uint32_t nam_readb (void *opaque, uint32_t addr) + static uint32_t nam_readw (void *opaque, uint32_t addr) + { + AC97LinkState *s = opaque; +- uint32_t val = ~0U; + uint32_t index = addr; + s->cas = 0; +- val = mixer_load (s, index); +- return val; ++ return mixer_load(s, index); + } + + static uint32_t nam_readl (void *opaque, uint32_t addr) +diff --git 
a/hw/audio/adlib.c b/hw/audio/adlib.c +index cb4178d861..5779d09815 100644 +--- a/hw/audio/adlib.c ++++ b/hw/audio/adlib.c +@@ -120,13 +120,10 @@ static void adlib_write(void *opaque, uint32_t nport, uint32_t val) + static uint32_t adlib_read(void *opaque, uint32_t nport) + { + AdlibState *s = opaque; +- uint8_t data; + int a = nport & 3; + + adlib_kill_timers (s); +- data = OPLRead (s->opl, a); +- +- return data; ++ return OPLRead (s->opl, a); + } + + static void timer_handler (void *opaque, int c, double interval_Sec) +diff --git a/hw/display/cirrus_vga.c b/hw/display/cirrus_vga.c +index 93afa26fda..a52d3094b9 100644 +--- a/hw/display/cirrus_vga.c ++++ b/hw/display/cirrus_vga.c +@@ -2411,12 +2411,10 @@ static uint64_t cirrus_linear_bitblt_read(void *opaque, + unsigned size) + { + CirrusVGAState *s = opaque; +- uint32_t ret; + + /* XXX handle bitblt */ + (void)s; +- ret = 0xff; +- return ret; ++ return 0xff; + } + + static void cirrus_linear_bitblt_write(void *opaque, +diff --git a/migration/ram.c b/migration/ram.c +index 5344c7d59e..92c506d13c 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -3101,9 +3101,7 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms) + } + trace_ram_postcopy_send_discard_bitmap(); + +- ret = postcopy_each_ram_send_discard(ms); +- +- return ret; ++ return postcopy_each_ram_send_discard(ms); + } + + /** +diff --git a/ui/gtk.c b/ui/gtk.c +index 692ccc7bbb..e032e3c36f 100644 +--- a/ui/gtk.c ++++ b/ui/gtk.c +@@ -1649,8 +1649,7 @@ static GSList *gd_vc_menu_init(GtkDisplayState *s, VirtualConsole *vc, + G_CALLBACK(gd_menu_switch_vc), s); + gtk_menu_shell_append(GTK_MENU_SHELL(view_menu), vc->menu_item); + +- group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(vc->menu_item)); +- return group; ++ return gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(vc->menu_item)); + } + + #if defined(CONFIG_VTE) +diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c +index bcc06d0e01..86c48b9fa5 100644 +--- a/util/qemu-sockets.c ++++ 
b/util/qemu-sockets.c +@@ -765,15 +765,12 @@ static int vsock_connect_addr(const struct sockaddr_vm *svm, Error **errp) + static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp) + { + struct sockaddr_vm svm; +- int sock = -1; + + if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) { + return -1; + } + +- sock = vsock_connect_addr(&svm, errp); +- +- return sock; ++ return vsock_connect_addr(&svm, errp); + } + + static int vsock_listen_saddr(VsockSocketAddress *vaddr, +-- +2.27.0 + diff --git a/kvm-Don-t-leak-memory-when-reallocation-fails.patch b/kvm-Don-t-leak-memory-when-reallocation-fails.patch new file mode 100755 index 0000000..5747672 --- /dev/null +++ b/kvm-Don-t-leak-memory-when-reallocation-fails.patch @@ -0,0 +1,58 @@ +From bcb6107f98d7b1edf687d7afd552a4528b7e673b Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Tue, 12 May 2020 21:15:13 +0100 +Subject: [PATCH 2/7] Don't leak memory when reallocation fails. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200512211514.1398384-2-jmaloy@redhat.com> +Patchwork-id: 96412 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] Don't leak memory when reallocation fails. +Bugzilla: 1749737 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Jindrich Novy + +Signed-off-by: Jindrich Novy +[ Marc-André - modified to use a temporary variable ] +Signed-off-by: Marc-André Lureau +(cherry picked from libslirp commit d171af3732a0610a25334b06b77fa547bd677918) +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. 
de Paula +--- + slirp/src/sbuf.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/slirp/src/sbuf.c b/slirp/src/sbuf.c +index abced48..0569c34 100644 +--- a/slirp/src/sbuf.c ++++ b/slirp/src/sbuf.c +@@ -39,13 +39,16 @@ void sbreserve(struct sbuf *sb, int size) + if (sb->sb_data) { + /* Already alloced, realloc if necessary */ + if (sb->sb_datalen != size) { +- sb->sb_wptr = sb->sb_rptr = sb->sb_data = +- (char *)realloc(sb->sb_data, size); ++ char *new = realloc(sb->sb_data, size); + sb->sb_cc = 0; +- if (sb->sb_wptr) ++ if (new) { ++ sb->sb_data = sb->sb_wptr = sb->sb_rptr = new; + sb->sb_datalen = size; +- else ++ } else { ++ free(sb->sb_data); ++ sb->sb_data = sb->sb_wptr = sb->sb_rptr = NULL; + sb->sb_datalen = 0; ++ } + } + } else { + sb->sb_wptr = sb->sb_rptr = sb->sb_data = (char *)malloc(size); +-- +1.8.3.1 + diff --git a/kvm-Drop-bogus-IPv6-messages.patch b/kvm-Drop-bogus-IPv6-messages.patch new file mode 100755 index 0000000..4c30a3b --- /dev/null +++ b/kvm-Drop-bogus-IPv6-messages.patch @@ -0,0 +1,51 @@ +From 89c4300c97739aa3291f0322037bb65068e08d41 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 19 Jan 2021 23:34:33 -0500 +Subject: [PATCH] Drop bogus IPv6 messages +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210119233433.1352902-2-jmaloy@redhat.com> +Patchwork-id: 100695 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] Drop bogus IPv6 messages +Bugzilla: 1918054 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Ralf Haferkamp + +Drop IPv6 message shorter than what's mentioned in the payload +length header (+ the size of the IPv6 header). They're invalid an could +lead to data leakage in icmp6_send_echoreply(). + +(cherry picked from libslirp commit c7ede54cbd2e2b25385325600958ba0124e31cc0) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + slirp/src/ip6_input.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/slirp/src/ip6_input.c b/slirp/src/ip6_input.c +index d9d2b7e9cd4..0f2b17853ad 100644 +--- a/slirp/src/ip6_input.c ++++ b/slirp/src/ip6_input.c +@@ -49,6 +49,13 @@ void ip6_input(struct mbuf *m) + goto bad; + } + ++ // Check if the message size is big enough to hold what's ++ // set in the payload length header. If not this is an invalid ++ // packet ++ if (m->m_len < ntohs(ip6->ip_pl) + sizeof(struct ip6)) { ++ goto bad; ++ } ++ + /* check ip_ttl for a correct ICMP reply */ + if (ip6->ip_hl == 0) { + icmp6_send_error(m, ICMP6_TIMXCEED, ICMP6_TIMXCEED_INTRANS); +-- +2.27.0 + diff --git a/kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch b/kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch new file mode 100755 index 0000000..2dd4457 --- /dev/null +++ b/kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch @@ -0,0 +1,59 @@ +From d0c668aa0ad255c3598267816154874541ac2943 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:42 -0400 +Subject: [PATCH 12/14] Fix "DHCP broken in libslirp v4.6.0" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-9-marcandre.lureau@redhat.com> +Patchwork-id: 101824 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 8/8] Fix "DHCP broken in libslirp v4.6.0" +Bugzilla: 1970819 1970835 1970843 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Akihiro Suda + +Fix issue 48 + +Signed-off-by: Akihiro Suda + +(cherry picked from commit c9f314f6e315a5518432761fea864196a290f799) +[ minor conflict fix due to indentation change ] +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/bootp.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/slirp/src/bootp.c b/slirp/src/bootp.c +index 5789187166..3e4af075f1 100644 
+--- a/slirp/src/bootp.c ++++ b/slirp/src/bootp.c +@@ -354,14 +354,14 @@ static void bootp_reply(Slirp *slirp, + q += sizeof(nak_msg) - 1; + } + assert(q < end); +- *q = +-RFC1533_END +-; ++ *q = RFC1533_END; + +-daddr.sin_addr.s_addr = 0xffffffffu; ++ daddr.sin_addr.s_addr = 0xffffffffu; + +-m->m_len = sizeof(struct bootp_t) - sizeof(struct ip) - sizeof(struct udphdr); +-udp_output(NULL, m, &saddr, &daddr, IPTOS_LOWDELAY); ++ assert ((q - rbp->bp_vend + 1) <= DHCP_OPT_LEN); ++ ++ m->m_len = sizeof(struct bootp_t) + (q - rbp->bp_vend + 1) - sizeof(struct ip) - sizeof(struct udphdr); ++ udp_output(NULL, m, &saddr, &daddr, IPTOS_LOWDELAY); + } + + void bootp_input(struct mbuf *m) +-- +2.27.0 + diff --git a/kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch b/kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch new file mode 100755 index 0000000..535c3af --- /dev/null +++ b/kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch @@ -0,0 +1,60 @@ +From a33ea192428d9c9307f1140f3e25631a6ef7657c Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Sat, 20 Jun 2020 15:02:59 -0400 +Subject: [PATCH 12/12] Fix use-afte-free in ip_reass() (CVE-2020-1983) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20200620150259.3352467-2-jmaloy@redhat.com> +Patchwork-id: 97678 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] Fix use-afte-free in ip_reass() (CVE-2020-1983) +Bugzilla: 1838070 +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +The q pointer is updated when the mbuf data is moved from m_dat to +m_ext. + +m_ext buffer may also be realloc()'ed and moved during m_cat(): +q should also be updated in this case. + +Reported-by: Aviv Sasson +Signed-off-by: Marc-André Lureau +Reviewed-by: Samuel Thibault + +(cherry picked from libslirp commit 9bd6c5913271eabcb7768a58197ed3301fe19f2d) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + slirp/src/ip_input.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/slirp/src/ip_input.c b/slirp/src/ip_input.c +index df1c846ade..0f5d522ec1 100644 +--- a/slirp/src/ip_input.c ++++ b/slirp/src/ip_input.c +@@ -329,7 +329,7 @@ insert: + q = fp->frag_link.next; + m = dtom(slirp, q); + +- int was_ext = m->m_flags & M_EXT; ++ int delta = (char *)q - (m->m_flags & M_EXT ? m->m_ext : m->m_dat); + + q = (struct ipasfrag *)q->ipf_next; + while (q != (struct ipasfrag *)&fp->frag_link) { +@@ -353,8 +353,7 @@ insert: + * the old buffer (in the mbuf), so we must point ip + * into the new buffer. + */ +- if (!was_ext && m->m_flags & M_EXT) { +- int delta = (char *)q - m->m_dat; ++ if (m->m_flags & M_EXT) { + q = (struct ipasfrag *)(m->m_ext + delta); + } + +-- +2.27.0 + diff --git a/kvm-MAINTAINERS-fix-qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch b/kvm-MAINTAINERS-fix-qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch new file mode 100755 index 0000000..dce89d9 --- /dev/null +++ b/kvm-MAINTAINERS-fix-qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch @@ -0,0 +1,55 @@ +From e3bec8c83459a68ae0c08e2ae0f1dbef24872d59 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:09 +0100 +Subject: [PATCH 04/26] MAINTAINERS: fix qcow2-bitmap.c under Dirty Bitmaps + header + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-2-eblake@redhat.com> +Patchwork-id: 97068 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 01/12] MAINTAINERS: fix qcow2-bitmap.c under Dirty Bitmaps header +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +From: Vladimir Sementsov-Ogievskiy + +Somehow I wrote not full path to the file. Fix that. + +Also, while being here, rearrange entries, so that includes go first, +then block, then migration, then util.
+ +Fixes: 052db8e71444d +Signed-off-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit 00637c6b0b67694127cc01dd75f3626da23acdaa) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + MAINTAINERS | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/MAINTAINERS b/MAINTAINERS +index d1b3e26..3a81ac9 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -1873,12 +1873,12 @@ M: John Snow + R: Vladimir Sementsov-Ogievskiy + L: qemu-block@nongnu.org + S: Supported +-F: util/hbitmap.c +-F: block/dirty-bitmap.c + F: include/qemu/hbitmap.h + F: include/block/dirty-bitmap.h +-F: qcow2-bitmap.c ++F: block/dirty-bitmap.c ++F: block/qcow2-bitmap.c + F: migration/block-dirty-bitmap.c ++F: util/hbitmap.c + F: tests/test-hbitmap.c + F: docs/interop/bitmaps.rst + T: git https://github.com/jnsnow/qemu.git bitmaps +-- +1.8.3.1 + diff --git a/kvm-RHEL-hw-i386-disable-nested-PERF_GLOBAL_CTRL-MSR-sup.patch b/kvm-RHEL-hw-i386-disable-nested-PERF_GLOBAL_CTRL-MSR-sup.patch new file mode 100755 index 0000000..1435017 --- /dev/null +++ b/kvm-RHEL-hw-i386-disable-nested-PERF_GLOBAL_CTRL-MSR-sup.patch @@ -0,0 +1,53 @@ +From 481357ea8ae32b6894860c296cf6a2898260195f Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 17 Jan 2020 13:18:27 +0100 +Subject: [PATCH 4/4] RHEL: hw/i386: disable nested PERF_GLOBAL_CTRL MSR + support + +RH-Author: Paolo Bonzini +Message-id: <20200117131827.20361-1-pbonzini@redhat.com> +Patchwork-id: 93405 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v3] RHEL: hw/i386: disable nested PERF_GLOBAL_CTRL MSR support +Bugzilla: 1559846 +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Miroslav Rezanina + +BZ: 1559846 +BRANCH: rhel-av-8.2.0 +BREW: 25775160 +UPSTREAM: RHEL only + +Nested PERF_GLOBAL_CTRL support is not present in the 8.2 kernel. Drop the +features via compat properties, they will be moved to 8.2 machine type compat +properties in the 8.3 timeframe. 
+ +Signed-off-by: Paolo Bonzini +--- + No change, for v2 I mistakenly wrote "origin/rhel-av-8.2.0" as the + branch. :( + + hw/i386/pc.c | 2 ++ + 1 file changed, 2 insertions(+) + +Signed-off-by: Miroslav Rezanina +--- + hw/i386/pc.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +index 61e70e4..73a0f11 100644 +--- a/hw/i386/pc.c ++++ b/hw/i386/pc.c +@@ -351,6 +351,8 @@ const size_t pc_compat_1_4_len = G_N_ELEMENTS(pc_compat_1_4); + GlobalProperty pc_rhel_compat[] = { + { TYPE_X86_CPU, "host-phys-bits", "on" }, + { TYPE_X86_CPU, "host-phys-bits-limit", "48" }, ++ { TYPE_X86_CPU, "vmx-entry-load-perf-global-ctrl", "off" }, ++ { TYPE_X86_CPU, "vmx-exit-load-perf-global-ctrl", "off" }, + /* bz 1508330 */ + { "vfio-pci", "x-no-geforce-quirks", "on" }, + }; +-- +1.8.3.1 + diff --git a/kvm-Reallocate-dirty_bmap-when-we-change-a-slot.patch b/kvm-Reallocate-dirty_bmap-when-we-change-a-slot.patch new file mode 100755 index 0000000..d717ae2 --- /dev/null +++ b/kvm-Reallocate-dirty_bmap-when-we-change-a-slot.patch @@ -0,0 +1,115 @@ +From c477581ccc6962651d4d6c702a6c3e2fcc5e4205 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Thu, 2 Jan 2020 11:56:51 +0000 +Subject: [PATCH 2/2] kvm: Reallocate dirty_bmap when we change a slot + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200102115651.140177-1-dgilbert@redhat.com> +Patchwork-id: 93256 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] kvm: Reallocate dirty_bmap when we change a slot +Bugzilla: 1772774 +RH-Acked-by: Peter Xu +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Laszlo Ersek + +From: "Dr. David Alan Gilbert" + +bz: https://bugzilla.redhat.com/show_bug.cgi?id=1772774 +brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=25575691 +branch: rhel-av-8.2.0 + +kvm_set_phys_mem can be called to reallocate a slot by something the +guest does (e.g. writing to PAM and other chipset registers). 
+This can happen in the middle of a migration, and if we're unlucky +it can now happen between the split 'sync' and 'clear'; the clear +asserts if there's no bmap to clear. Recreate the bmap whenever +we change the slot, keeping the clear path happy. + +Typically this is triggered by the guest rebooting during a migrate. + +Corresponds to: +https://bugzilla.redhat.com/show_bug.cgi?id=1772774 +https://bugzilla.redhat.com/show_bug.cgi?id=1771032 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Peter Xu +(cherry picked from commit 9b3a31c745b61758aaa5466a3a9fc0526d409188) +Signed-off-by: Danilo C. L. de Paula +--- + accel/kvm/kvm-all.c | 44 +++++++++++++++++++++++++++++--------------- + 1 file changed, 29 insertions(+), 15 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index dc3ed7f..5007bda 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -518,6 +518,27 @@ static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, + + #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) + ++/* Allocate the dirty bitmap for a slot */ ++static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem) ++{ ++ /* ++ * XXX bad kernel interface alert ++ * For dirty bitmap, kernel allocates array of size aligned to ++ * bits-per-long. But for case when the kernel is 64bits and ++ * the userspace is 32bits, userspace can't align to the same ++ * bits-per-long, since sizeof(long) is different between kernel ++ * and user space. This way, userspace will provide buffer which ++ * may be 4 bytes less than the kernel will use, resulting in ++ * userspace memory corruption (which is not detectable by valgrind ++ * too, in most cases). ++ * So for now, let's align to 64 instead of HOST_LONG_BITS here, in ++ * a hope that sizeof(long) won't become >8 any time soon. 
++ */ ++ hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), ++ /*HOST_LONG_BITS*/ 64) / 8; ++ mem->dirty_bmap = g_malloc0(bitmap_size); ++} ++ + /** + * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space + * +@@ -550,23 +571,9 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, + goto out; + } + +- /* XXX bad kernel interface alert +- * For dirty bitmap, kernel allocates array of size aligned to +- * bits-per-long. But for case when the kernel is 64bits and +- * the userspace is 32bits, userspace can't align to the same +- * bits-per-long, since sizeof(long) is different between kernel +- * and user space. This way, userspace will provide buffer which +- * may be 4 bytes less than the kernel will use, resulting in +- * userspace memory corruption (which is not detectable by valgrind +- * too, in most cases). +- * So for now, let's align to 64 instead of HOST_LONG_BITS here, in +- * a hope that sizeof(long) won't become >8 any time soon. +- */ + if (!mem->dirty_bmap) { +- hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), +- /*HOST_LONG_BITS*/ 64) / 8; + /* Allocate on the first log_sync, once and for all */ +- mem->dirty_bmap = g_malloc0(bitmap_size); ++ kvm_memslot_init_dirty_bitmap(mem); + } + + d.dirty_bitmap = mem->dirty_bmap; +@@ -1067,6 +1074,13 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, + mem->ram = ram; + mem->flags = kvm_mem_flags(mr); + ++ if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { ++ /* ++ * Reallocate the bmap; it means it doesn't disappear in ++ * middle of a migrate. 
++ */ ++ kvm_memslot_init_dirty_bitmap(mem); ++ } + err = kvm_set_user_memory_region(kml, mem, true); + if (err) { + fprintf(stderr, "%s: error registering slot: %s\n", __func__, +-- +1.8.3.1 + diff --git a/kvm-Replace-remaining-malloc-free-user-with-glib.patch b/kvm-Replace-remaining-malloc-free-user-with-glib.patch new file mode 100755 index 0000000..71e6e47 --- /dev/null +++ b/kvm-Replace-remaining-malloc-free-user-with-glib.patch @@ -0,0 +1,118 @@ +From c012dc9b501d96a2ff54a8a7a182726043b69aeb Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Tue, 12 May 2020 21:15:14 +0100 +Subject: [PATCH 3/7] Replace remaining malloc/free user with glib +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200512211514.1398384-3-jmaloy@redhat.com> +Patchwork-id: 96413 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] Replace remaining malloc/free user with glib +Bugzilla: 1749737 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Marc-André Lureau + +glib mem functions are already used in various places. Let's not mix +the two, and instead abort on OOM conditions. + +Signed-off-by: Marc-André Lureau +(cherry picked from libslirp commit 3a494648526be4eb96cba739a816a60e933ffd14) +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. 
de Paula +--- + slirp/src/sbuf.c | 21 ++++++--------------- + slirp/src/socket.c | 2 +- + slirp/src/tcp_subr.c | 8 ++------ + 3 files changed, 9 insertions(+), 22 deletions(-) + +diff --git a/slirp/src/sbuf.c b/slirp/src/sbuf.c +index 0569c34..eab87f3 100644 +--- a/slirp/src/sbuf.c ++++ b/slirp/src/sbuf.c +@@ -9,7 +9,7 @@ static void sbappendsb(struct sbuf *sb, struct mbuf *m); + + void sbfree(struct sbuf *sb) + { +- free(sb->sb_data); ++ g_free(sb->sb_data); + } + + bool sbdrop(struct sbuf *sb, int num) +@@ -39,24 +39,15 @@ void sbreserve(struct sbuf *sb, int size) + if (sb->sb_data) { + /* Already alloced, realloc if necessary */ + if (sb->sb_datalen != size) { +- char *new = realloc(sb->sb_data, size); ++ char *new = g_realloc(sb->sb_data, size); + sb->sb_cc = 0; +- if (new) { +- sb->sb_data = sb->sb_wptr = sb->sb_rptr = new; +- sb->sb_datalen = size; +- } else { +- free(sb->sb_data); +- sb->sb_data = sb->sb_wptr = sb->sb_rptr = NULL; +- sb->sb_datalen = 0; +- } ++ sb->sb_data = sb->sb_wptr = sb->sb_rptr = new; ++ sb->sb_datalen = size; + } + } else { +- sb->sb_wptr = sb->sb_rptr = sb->sb_data = (char *)malloc(size); ++ sb->sb_wptr = sb->sb_rptr = sb->sb_data = g_malloc(size); + sb->sb_cc = 0; +- if (sb->sb_wptr) +- sb->sb_datalen = size; +- else +- sb->sb_datalen = 0; ++ sb->sb_datalen = size; + } + } + +diff --git a/slirp/src/socket.c b/slirp/src/socket.c +index 34daffc..ace18bf 100644 +--- a/slirp/src/socket.c ++++ b/slirp/src/socket.c +@@ -95,7 +95,7 @@ void sofree(struct socket *so) + remque(so); /* crashes if so is not in a queue */ + + if (so->so_tcpcb) { +- free(so->so_tcpcb); ++ g_free(so->so_tcpcb); + } + g_free(so); + } +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index 26d4ead..4e5a801 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -255,11 +255,7 @@ struct tcpcb *tcp_newtcpcb(struct socket *so) + { + register struct tcpcb *tp; + +- tp = (struct tcpcb *)malloc(sizeof(*tp)); +- if (tp == NULL) +- return ((struct tcpcb 
*)0); +- +- memset((char *)tp, 0, sizeof(struct tcpcb)); ++ tp = g_new0(struct tcpcb, 1); + tp->seg_next = tp->seg_prev = (struct tcpiphdr *)tp; + tp->t_maxseg = (so->so_ffamily == AF_INET) ? TCP_MSS : TCP6_MSS; + +@@ -330,7 +326,7 @@ struct tcpcb *tcp_close(struct tcpcb *tp) + remque(tcpiphdr2qlink(tcpiphdr_prev(t))); + m_free(m); + } +- free(tp); ++ g_free(tp); + so->so_tcpcb = NULL; + /* clobber input socket cache if we're closing the cached connection */ + if (so == slirp->tcp_last_so) +-- +1.8.3.1 + diff --git a/kvm-Revert-RHEL-disable-hostmem-memfd.patch b/kvm-Revert-RHEL-disable-hostmem-memfd.patch new file mode 100755 index 0000000..f959752 --- /dev/null +++ b/kvm-Revert-RHEL-disable-hostmem-memfd.patch @@ -0,0 +1,58 @@ +From 559d5899473dea180ced39a32bfbfbf2310c6e04 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Mon, 25 May 2020 15:33:06 +0100 +Subject: [PATCH 4/7] Revert "RHEL: disable hostmem-memfd" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200525153306.15373-1-marcandre.lureau@redhat.com> +Patchwork-id: 96747 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH] Revert "RHEL: disable hostmem-memfd" +Bugzilla: 1839030 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefano Garzarella + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1839030 +BRANCH: rhel-av-8.2.1 +UPSTREAM: RHEL-only +BREW: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=28817132 + +This reverts commit f7587ddb9a2731bf678a24156b6285dda79a4b2b. + +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. 
de Paula +--- + backends/Makefile.objs | 3 +-- + util/memfd.c | 2 +- + 2 files changed, 2 insertions(+), 3 deletions(-) + +diff --git a/backends/Makefile.objs b/backends/Makefile.objs +index f328d40..f069111 100644 +--- a/backends/Makefile.objs ++++ b/backends/Makefile.objs +@@ -16,5 +16,4 @@ endif + + common-obj-$(call land,$(CONFIG_VHOST_USER),$(CONFIG_VIRTIO)) += vhost-user.o + +-# RHEL: disable memfd +-# common-obj-$(CONFIG_LINUX) += hostmem-memfd.o ++common-obj-$(CONFIG_LINUX) += hostmem-memfd.o +diff --git a/util/memfd.c b/util/memfd.c +index 3303ec9..4a3c07e 100644 +--- a/util/memfd.c ++++ b/util/memfd.c +@@ -193,7 +193,7 @@ bool qemu_memfd_alloc_check(void) + */ + bool qemu_memfd_check(unsigned int flags) + { +-#if 0 /* RHEL: memfd support disabled */ ++#ifdef CONFIG_LINUX + int mfd = memfd_create("test", flags | MFD_CLOEXEC); + + if (mfd >= 0) { +-- +1.8.3.1 + diff --git a/kvm-Revert-mirror-Don-t-let-an-operation-wait-for-itself.patch b/kvm-Revert-mirror-Don-t-let-an-operation-wait-for-itself.patch new file mode 100755 index 0000000..0c1c37f --- /dev/null +++ b/kvm-Revert-mirror-Don-t-let-an-operation-wait-for-itself.patch @@ -0,0 +1,121 @@ +From 71b5267ed33f9e60bc98acbabcbed62f01a96ff4 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 30 Mar 2020 11:19:23 +0100 +Subject: [PATCH 3/4] Revert "mirror: Don't let an operation wait for itself" + +RH-Author: Kevin Wolf +Message-id: <20200330111924.22938-2-kwolf@redhat.com> +Patchwork-id: 94464 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] Revert "mirror: Don't let an operation wait for itself" +Bugzilla: 1794692 +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +This reverts commit 7e6c4ff792734e196c8ca82564c56b5e7c6288ca. + +The fix was incomplete as it only protected against requests waiting for +themselves, but not against requests waiting for each other. We need a +different solution. 
+ +Signed-off-by: Kevin Wolf +Message-Id: <20200326153628.4869-2-kwolf@redhat.com> +Reviewed-by: Eric Blake +Signed-off-by: Kevin Wolf +(cherry picked from commit 9178f4fe5f083064f5c91f04d98c815ce5a5af1c) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/mirror.c | 21 +++++++++------------ + 1 file changed, 9 insertions(+), 12 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index cacbc70..8959e42 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -283,14 +283,11 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, + } + + static inline void coroutine_fn +-mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) ++mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) + { + MirrorOp *op; + + QTAILQ_FOREACH(op, &s->ops_in_flight, next) { +- if (self == op) { +- continue; +- } + /* Do not wait on pseudo ops, because it may in turn wait on + * some other operation to start, which may in fact be the + * caller of this function. Since there is only one pseudo op +@@ -305,10 +302,10 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) + } + + static inline void coroutine_fn +-mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s, MirrorOp *self) ++mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) + { + /* Only non-active operations use up in-flight slots */ +- mirror_wait_for_any_operation(s, self, false); ++ mirror_wait_for_any_operation(s, false); + } + + /* Perform a mirror copy operation. 
+@@ -351,7 +348,7 @@ static void coroutine_fn mirror_co_read(void *opaque) + + while (s->buf_free_count < nb_chunks) { + trace_mirror_yield_in_flight(s, op->offset, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s, op); ++ mirror_wait_for_free_in_flight_slot(s); + } + + /* Now make a QEMUIOVector taking enough granularity-sized chunks +@@ -558,7 +555,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) + + while (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield_in_flight(s, offset, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s, pseudo_op); ++ mirror_wait_for_free_in_flight_slot(s); + } + + if (s->ret < 0) { +@@ -612,7 +609,7 @@ static void mirror_free_init(MirrorBlockJob *s) + static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s) + { + while (s->in_flight > 0) { +- mirror_wait_for_free_in_flight_slot(s, NULL); ++ mirror_wait_for_free_in_flight_slot(s); + } + } + +@@ -797,7 +794,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) + if (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield(s, UINT64_MAX, s->buf_free_count, + s->in_flight); +- mirror_wait_for_free_in_flight_slot(s, NULL); ++ mirror_wait_for_free_in_flight_slot(s); + continue; + } + +@@ -950,7 +947,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + /* Do not start passive operations while there are active + * writes in progress */ + while (s->in_active_write_counter) { +- mirror_wait_for_any_operation(s, NULL, true); ++ mirror_wait_for_any_operation(s, true); + } + + if (s->ret < 0) { +@@ -976,7 +973,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || + (cnt == 0 && s->in_flight > 0)) { + trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s, NULL); ++ mirror_wait_for_free_in_flight_slot(s); + continue; + } else if (cnt != 0) { + delay_ns = mirror_iteration(s); +-- +1.8.3.1 + diff --git 
a/kvm-Virtiofsd-fix-memory-leak-on-fuse-queueinfo.patch b/kvm-Virtiofsd-fix-memory-leak-on-fuse-queueinfo.patch new file mode 100755 index 0000000..dc65c26 --- /dev/null +++ b/kvm-Virtiofsd-fix-memory-leak-on-fuse-queueinfo.patch @@ -0,0 +1,63 @@ +From ceb6d97674b8bc9a072db1be4167411bc0ee48d7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:02 +0100 +Subject: [PATCH 091/116] Virtiofsd: fix memory leak on fuse queueinfo +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-88-dgilbert@redhat.com> +Patchwork-id: 93542 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 087/112] Virtiofsd: fix memory leak on fuse queueinfo +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +For fuse's queueinfo, both queueinfo array and queueinfos are allocated in +fv_queue_set_started() but not cleaned up when the daemon process quits. + +This fixes the leak in proper places. + +Signed-off-by: Liu Bo +Signed-off-by: Eric Ren +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 740b0b700a6338a1cf60c26229651ac5f6724944) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index b7948de..fb8d6d1 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -625,6 +625,8 @@ static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx) + } + close(ourqi->kill_fd); + ourqi->kick_fd = -1; ++ free(vud->qi[qidx]); ++ vud->qi[qidx] = NULL; + } + + /* Callback from libvhost-user on start or stop of a queue */ +@@ -884,6 +886,12 @@ int virtio_session_mount(struct fuse_session *se) + void virtio_session_close(struct fuse_session *se) + { + close(se->vu_socketfd); ++ ++ if (!se->virtio_dev) { ++ return; ++ } ++ ++ free(se->virtio_dev->qi); + free(se->virtio_dev); + se->virtio_dev = NULL; + } +-- +1.8.3.1 + diff --git a/kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch b/kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch new file mode 100755 index 0000000..1538d11 --- /dev/null +++ b/kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch @@ -0,0 +1,82 @@ +From dcac680adb6b8624f14eda3e812521bddbe8ecea Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:04 -0400 +Subject: [PATCH 5/7] acpi: accept byte and word access to core ACPI registers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-5-jmaloy@redhat.com> +Patchwork-id: 101482 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 4/6] acpi: accept byte and word access to core ACPI registers +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Michael Tokarev + +All ISA registers should be accessible as bytes, words or dwords +(if wide enough). 
Fix the access constraints for acpi-pm-evt, +acpi-pm-tmr & acpi-cnt registers. + +Fixes: 5d971f9e67 (memory: Revert "memory: accept mismatching sizes in memory_region_access_valid") +Fixes: afafe4bbe0 (apci: switch cnt to memory api) +Fixes: 77d58b1e47 (apci: switch timer to memory api) +Fixes: b5a7c024d2 (apci: switch evt to memory api) +Buglink: https://lore.kernel.org/xen-devel/20200630170913.123646-1-anthony.perard@citrix.com/T/ +Buglink: https://bugs.debian.org/964793 +BugLink: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=964247 +BugLink: https://bugs.launchpad.net/bugs/1886318 +Reported-By: Simon John +Signed-off-by: Michael Tokarev +Message-Id: <20200720160627.15491-1-mjt@msgid.tls.msk.ru> +Cc: qemu-stable@nongnu.org +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin + +(cherry picked from commit dba04c3488c4699f5afe96f66e448b1d447cf3fb) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/acpi/core.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/hw/acpi/core.c b/hw/acpi/core.c +index 45cbed49ab..d85052c34a 100644 +--- a/hw/acpi/core.c ++++ b/hw/acpi/core.c +@@ -461,7 +461,8 @@ static void acpi_pm_evt_write(void *opaque, hwaddr addr, uint64_t val, + static const MemoryRegionOps acpi_pm_evt_ops = { + .read = acpi_pm_evt_read, + .write = acpi_pm_evt_write, +- .valid.min_access_size = 2, ++ .impl.min_access_size = 2, ++ .valid.min_access_size = 1, + .valid.max_access_size = 2, + .endianness = DEVICE_LITTLE_ENDIAN, + }; +@@ -530,7 +531,8 @@ static void acpi_pm_tmr_write(void *opaque, hwaddr addr, uint64_t val, + static const MemoryRegionOps acpi_pm_tmr_ops = { + .read = acpi_pm_tmr_read, + .write = acpi_pm_tmr_write, +- .valid.min_access_size = 4, ++ .impl.min_access_size = 4, ++ .valid.min_access_size = 1, + .valid.max_access_size = 4, + .endianness = DEVICE_LITTLE_ENDIAN, + }; +@@ -602,7 +604,8 @@ static void acpi_pm_cnt_write(void *opaque, hwaddr addr, uint64_t val, + static const 
MemoryRegionOps acpi_pm_cnt_ops = { + .read = acpi_pm_cnt_read, + .write = acpi_pm_cnt_write, +- .valid.min_access_size = 2, ++ .impl.min_access_size = 2, ++ .valid.min_access_size = 1, + .valid.max_access_size = 2, + .endianness = DEVICE_LITTLE_ENDIAN, + }; +-- +2.27.0 + diff --git a/kvm-aio-posix-completely-stop-polling-when-disabled.patch b/kvm-aio-posix-completely-stop-polling-when-disabled.patch new file mode 100755 index 0000000..3993181 --- /dev/null +++ b/kvm-aio-posix-completely-stop-polling-when-disabled.patch @@ -0,0 +1,104 @@ +From 4b4fb1cccb8e0307658cee3bc90c77e5f1dde60a Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:49 -0400 +Subject: [PATCH 13/14] aio-posix: completely stop polling when disabled + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-10-thuth@redhat.com> +Patchwork-id: 98603 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 9/9] aio-posix: completely stop polling when disabled +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +From: Stefan Hajnoczi + +One iteration of polling is always performed even when polling is +disabled. This is done because: +1. Userspace polling is cheaper than making a syscall. We might get + lucky. +2. We must poll once more after polling has stopped in case an event + occurred while stopping polling. + +However, there are downsides: +1. Polling becomes a bottleneck when the number of event sources is very + high. It's more efficient to monitor fds in that case. +2. A high-frequency polling event source can starve non-polling event + sources because ppoll(2)/epoll(7) is never invoked. + +This patch removes the forced polling iteration so that poll_ns=0 really +means no polling. + +IOPS increases from 10k to 60k when the guest has 100 +virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 +device because the large number of event sources being polled slows down +the event loop. 
+ +Signed-off-by: Stefan Hajnoczi +Link: https://lore.kernel.org/r/20200305170806.1313245-2-stefanha@redhat.com +Message-Id: <20200305170806.1313245-2-stefanha@redhat.com> +(cherry picked from commit e4346192f1c2e1683a807b46efac47ef0cf9b545) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + util/aio-posix.c | 22 +++++++++++++++------- + 1 file changed, 15 insertions(+), 7 deletions(-) + +diff --git a/util/aio-posix.c b/util/aio-posix.c +index a4977f538e..abc396d030 100644 +--- a/util/aio-posix.c ++++ b/util/aio-posix.c +@@ -340,12 +340,13 @@ void aio_set_event_notifier_poll(AioContext *ctx, + (IOHandler *)io_poll_end); + } + +-static void poll_set_started(AioContext *ctx, bool started) ++static bool poll_set_started(AioContext *ctx, bool started) + { + AioHandler *node; ++ bool progress = false; + + if (started == ctx->poll_started) { +- return; ++ return false; + } + + ctx->poll_started = started; +@@ -367,8 +368,15 @@ static void poll_set_started(AioContext *ctx, bool started) + if (fn) { + fn(node->opaque); + } ++ ++ /* Poll one last time in case ->io_poll_end() raced with the event */ ++ if (!started) { ++ progress = node->io_poll(node->opaque) || progress; ++ } + } + qemu_lockcnt_dec(&ctx->list_lock); ++ ++ return progress; + } + + +@@ -599,12 +607,12 @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) + } + } + +- poll_set_started(ctx, false); ++ if (poll_set_started(ctx, false)) { ++ *timeout = 0; ++ return true; ++ } + +- /* Even if we don't run busy polling, try polling once in case it can make +- * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2). 
+- */ +- return run_poll_handlers_once(ctx, timeout); ++ return false; + } + + bool aio_poll(AioContext *ctx, bool blocking) +-- +2.27.0 + diff --git a/kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch b/kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch new file mode 100755 index 0000000..a234140 --- /dev/null +++ b/kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch @@ -0,0 +1,132 @@ +From b474155fdc38f86f516c14ba9a6f934616d589ef Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Wed, 4 Aug 2021 03:27:22 -0400 +Subject: [PATCH 1/2] aio-wait: delegate polling of main AioContext if BQL not + held + +RH-Author: Andrew Jones +Message-id: <20210729134448.4995-2-drjones@redhat.com> +Patchwork-id: 101935 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 1/2] aio-wait: delegate polling of main AioContext if BQL not held +Bugzilla: 1969848 +RH-Acked-by: Gavin Shan +RH-Acked-by: Auger Eric +RH-Acked-by: Stefan Hajnoczi + +From: Paolo Bonzini + +Any thread that is not an iothread returns NULL for qemu_get_current_aio_context(). +As a result, it would also return true for +in_aio_context_home_thread(qemu_get_aio_context()), causing +AIO_WAIT_WHILE to invoke aio_poll() directly. This is incorrect +if the BQL is not held, because aio_poll() does not expect to +run concurrently from multiple threads, and it can actually +happen when savevm writes to the vmstate file from the +migration thread. + +Therefore, restrict in_aio_context_home_thread to return true +for the main AioContext only if the BQL is held. + +The function is moved to aio-wait.h because it is mostly used +there and to avoid a circular reference between main-loop.h +and block/aio.h.
+ +Signed-off-by: Paolo Bonzini +Message-Id: <20200407140746.8041-5-pbonzini@redhat.com> +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 3c18a92dc4b55ca8cc37a755ed119f11c0f34099) +Signed-off-by: Andrew Jones +Signed-off-by: Miroslav Rezanina +--- + include/block/aio-wait.h | 22 ++++++++++++++++++++++ + include/block/aio.h | 29 ++++++++++------------------- + 2 files changed, 32 insertions(+), 19 deletions(-) + +diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h +index afeeb18f95..716d2639df 100644 +--- a/include/block/aio-wait.h ++++ b/include/block/aio-wait.h +@@ -26,6 +26,7 @@ + #define QEMU_AIO_WAIT_H + + #include "block/aio.h" ++#include "qemu/main-loop.h" + + /** + * AioWait: +@@ -124,4 +125,25 @@ void aio_wait_kick(void); + */ + void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque); + ++/** ++ * in_aio_context_home_thread: ++ * @ctx: the aio context ++ * ++ * Return whether we are running in the thread that normally runs @ctx. Note ++ * that acquiring/releasing ctx does not affect the outcome, each AioContext ++ * still only has one home thread that is responsible for running it. ++ */ ++static inline bool in_aio_context_home_thread(AioContext *ctx) ++{ ++ if (ctx == qemu_get_current_aio_context()) { ++ return true; ++ } ++ ++ if (ctx == qemu_get_aio_context()) { ++ return qemu_mutex_iothread_locked(); ++ } else { ++ return false; ++ } ++} ++ + #endif /* QEMU_AIO_WAIT_H */ +diff --git a/include/block/aio.h b/include/block/aio.h +index 6b0d52f732..9d28e247df 100644 +--- a/include/block/aio.h ++++ b/include/block/aio.h +@@ -60,12 +60,16 @@ struct AioContext { + QLIST_HEAD(, AioHandler) aio_handlers; + + /* Used to avoid unnecessary event_notifier_set calls in aio_notify; +- * accessed with atomic primitives. If this field is 0, everything +- * (file descriptors, bottom halves, timers) will be re-evaluated +- * before the next blocking poll(), thus the event_notifier_set call +- * can be skipped. 
If it is non-zero, you may need to wake up a +- * concurrent aio_poll or the glib main event loop, making +- * event_notifier_set necessary. ++ * only written from the AioContext home thread, or under the BQL in ++ * the case of the main AioContext. However, it is read from any ++ * thread so it is still accessed with atomic primitives. ++ * ++ * If this field is 0, everything (file descriptors, bottom halves, ++ * timers) will be re-evaluated before the next blocking poll() or ++ * io_uring wait; therefore, the event_notifier_set call can be ++ * skipped. If it is non-zero, you may need to wake up a concurrent ++ * aio_poll or the glib main event loop, making event_notifier_set ++ * necessary. + * + * Bit 0 is reserved for GSource usage of the AioContext, and is 1 + * between a call to aio_ctx_prepare and the next call to aio_ctx_check. +@@ -580,19 +584,6 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co); + */ + AioContext *qemu_get_current_aio_context(void); + +-/** +- * in_aio_context_home_thread: +- * @ctx: the aio context +- * +- * Return whether we are running in the thread that normally runs @ctx. Note +- * that acquiring/releasing ctx does not affect the outcome, each AioContext +- * still only has one home thread that is responsible for running it. 
+- */ +-static inline bool in_aio_context_home_thread(AioContext *ctx) +-{ +- return ctx == qemu_get_current_aio_context(); +-} +- + /** + * aio_context_setup: + * @ctx: the aio context +-- +2.27.0 + diff --git a/kvm-apic-Use-32bit-APIC-ID-for-migration-instance-ID.patch b/kvm-apic-Use-32bit-APIC-ID-for-migration-instance-ID.patch new file mode 100755 index 0000000..becba21 --- /dev/null +++ b/kvm-apic-Use-32bit-APIC-ID-for-migration-instance-ID.patch @@ -0,0 +1,62 @@ +From 0d5a09173eb75b7e56122c2aefb2646a2be58400 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 31 Jan 2020 17:12:57 +0000 +Subject: [PATCH 15/15] apic: Use 32bit APIC ID for migration instance ID + +RH-Author: Peter Xu +Message-id: <20200131171257.1066593-4-peterx@redhat.com> +Patchwork-id: 93628 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/3] apic: Use 32bit APIC ID for migration instance ID +Bugzilla: 1529231 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Juan Quintela +RH-Acked-by: Dr. David Alan Gilbert + +Migration is silently broken now with x2apic config like this: + + -smp 200,maxcpus=288,sockets=2,cores=72,threads=2 \ + -device intel-iommu,intremap=on,eim=on + +After migration, the guest kernel could hang at anything, due to +x2apic bit not migrated correctly in IA32_APIC_BASE on some vcpus, so +any operations related to x2apic could be broken then (e.g., RDMSR on +x2apic MSRs could fail because KVM would think that the vcpu hasn't +enabled x2apic at all). + +The issue is that the x2apic bit was never applied correctly for vcpus +whose ID > 255 when migrate completes, and that's because when we +migrate APIC we use the APICCommonState.id as instance ID of the +migration stream, while that's too short for x2apic. + +Let's use the newly introduced initial_apic_id for that. 
+ +Signed-off-by: Peter Xu +Reviewed-by: Juan Quintela +Reviewed-by: Eduardo Habkost +Signed-off-by: Juan Quintela +(cherry picked from commit 0ab994867c365db21e15f9503922c79234d8e40e) +Signed-off-by: Peter Xu +Signed-off-by: Danilo C. L. de Paula +--- + hw/intc/apic_common.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c +index 54b8731..b5dbeb6 100644 +--- a/hw/intc/apic_common.c ++++ b/hw/intc/apic_common.c +@@ -268,7 +268,10 @@ static void apic_common_realize(DeviceState *dev, Error **errp) + APICCommonState *s = APIC_COMMON(dev); + APICCommonClass *info; + static DeviceState *vapic; +- uint32_t instance_id = s->id; ++ uint32_t instance_id = s->initial_apic_id; ++ ++ /* Normally initial APIC ID should be no more than hundreds */ ++ assert(instance_id != VMSTATE_INSTANCE_ID_ANY); + + info = APIC_COMMON_GET_CLASS(s); + info->realize(dev, errp); +-- +1.8.3.1 + diff --git a/kvm-async-use-explicit-memory-barriers.patch b/kvm-async-use-explicit-memory-barriers.patch new file mode 100755 index 0000000..2bf7245 --- /dev/null +++ b/kvm-async-use-explicit-memory-barriers.patch @@ -0,0 +1,183 @@ +From 82a02aec3a8b3c2ac925d0b71ea4c35aa5d6463b Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Wed, 4 Aug 2021 03:27:24 -0400 +Subject: [PATCH 2/2] async: use explicit memory barriers + +RH-Author: Andrew Jones +Message-id: <20210729134448.4995-3-drjones@redhat.com> +Patchwork-id: 101937 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 2/2] async: use explicit memory barriers +Bugzilla: 1969848 +RH-Acked-by: Gavin Shan +RH-Acked-by: Auger Eric +RH-Acked-by: Stefan Hajnoczi + +From: Paolo Bonzini + +When using C11 atomics, non-seqcst reads and writes do not participate +in the total order of seqcst operations. 
In util/async.c and util/aio-posix.c, +in particular, the pattern that we use + + write ctx->notify_me write bh->scheduled + read bh->scheduled read ctx->notify_me + if !bh->scheduled, sleep if ctx->notify_me, notify + +needs to use seqcst operations for both the write and the read. In +general this is something that we do not want, because there can be +many sources that are polled in addition to bottom halves. The +alternative is to place a seqcst memory barrier between the write +and the read. This also comes with a disadvantage, in that the +memory barrier is implicit on strongly-ordered architectures and +it wastes a few dozen clock cycles. + +Fortunately, ctx->notify_me is never written concurrently by two +threads, so we can assert that and relax the writes to ctx->notify_me. +The resulting solution works and performs well on both aarch64 and x86. + +Note that the atomic_set/atomic_read combination is not an atomic +read-modify-write, and therefore it is even weaker than C11 ATOMIC_RELAXED; +on x86, ATOMIC_RELAXED compiles to a locked operation. + +Analyzed-by: Ying Fang +Signed-off-by: Paolo Bonzini +Tested-by: Ying Fang +Message-Id: <20200407140746.8041-6-pbonzini@redhat.com> +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 5710a3e09f9b85801e5ce70797a4a511e5fc9e2c) +Signed-off-by: Andrew Jones +Signed-off-by: Miroslav Rezanina +--- + util/aio-posix.c | 16 ++++++++++++++-- + util/aio-win32.c | 17 ++++++++++++++--- + util/async.c | 16 ++++++++++++---- + 3 files changed, 40 insertions(+), 9 deletions(-) + +diff --git a/util/aio-posix.c b/util/aio-posix.c +index abc396d030..8cfb25650d 100644 +--- a/util/aio-posix.c ++++ b/util/aio-posix.c +@@ -624,6 +624,11 @@ bool aio_poll(AioContext *ctx, bool blocking) + int64_t timeout; + int64_t start = 0; + ++ /* ++ * There cannot be two concurrent aio_poll calls for the same AioContext (or ++ * an aio_poll concurrent with a GSource prepare/check/dispatch callback). 
++ * We rely on this below to avoid slow locked accesses to ctx->notify_me. ++ */ + assert(in_aio_context_home_thread(ctx)); + + /* aio_notify can avoid the expensive event_notifier_set if +@@ -634,7 +639,13 @@ bool aio_poll(AioContext *ctx, bool blocking) + * so disable the optimization now. + */ + if (blocking) { +- atomic_add(&ctx->notify_me, 2); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). ++ */ ++ smp_mb(); + } + + qemu_lockcnt_inc(&ctx->list_lock); +@@ -679,7 +690,8 @@ bool aio_poll(AioContext *ctx, bool blocking) + } + + if (blocking) { +- atomic_sub(&ctx->notify_me, 2); ++ /* Finish the poll before clearing the flag. */ ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); + aio_notify_accept(ctx); + } + +diff --git a/util/aio-win32.c b/util/aio-win32.c +index a23b9c364d..729d533faf 100644 +--- a/util/aio-win32.c ++++ b/util/aio-win32.c +@@ -321,6 +321,12 @@ bool aio_poll(AioContext *ctx, bool blocking) + int count; + int timeout; + ++ /* ++ * There cannot be two concurrent aio_poll calls for the same AioContext (or ++ * an aio_poll concurrent with a GSource prepare/check/dispatch callback). ++ * We rely on this below to avoid slow locked accesses to ctx->notify_me. ++ */ ++ assert(in_aio_context_home_thread(ctx)); + progress = false; + + /* aio_notify can avoid the expensive event_notifier_set if +@@ -331,7 +337,13 @@ bool aio_poll(AioContext *ctx, bool blocking) + * so disable the optimization now. + */ + if (blocking) { +- atomic_add(&ctx->notify_me, 2); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). 
++ */ ++ smp_mb(); + } + + qemu_lockcnt_inc(&ctx->list_lock); +@@ -364,8 +376,7 @@ bool aio_poll(AioContext *ctx, bool blocking) + ret = WaitForMultipleObjects(count, events, FALSE, timeout); + if (blocking) { + assert(first); +- assert(in_aio_context_home_thread(ctx)); +- atomic_sub(&ctx->notify_me, 2); ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); + aio_notify_accept(ctx); + } + +diff --git a/util/async.c b/util/async.c +index b1fa5319e5..c65c58bbc9 100644 +--- a/util/async.c ++++ b/util/async.c +@@ -220,7 +220,14 @@ aio_ctx_prepare(GSource *source, gint *timeout) + { + AioContext *ctx = (AioContext *) source; + +- atomic_or(&ctx->notify_me, 1); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) | 1); ++ ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). ++ */ ++ smp_mb(); + + /* We assume there is no timeout already supplied */ + *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); +@@ -238,7 +245,8 @@ aio_ctx_check(GSource *source) + AioContext *ctx = (AioContext *) source; + QEMUBH *bh; + +- atomic_and(&ctx->notify_me, ~1); ++ /* Finish computing the timeout before clearing the flag. */ ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) & ~1); + aio_notify_accept(ctx); + + for (bh = ctx->first_bh; bh; bh = bh->next) { +@@ -343,10 +351,10 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) + void aio_notify(AioContext *ctx) + { + /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs +- * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll. ++ * with smp_mb in aio_ctx_prepare or aio_poll. 
+ */ + smp_mb(); +- if (ctx->notify_me) { ++ if (atomic_read(&ctx->notify_me)) { + event_notifier_set(&ctx->notifier); + atomic_mb_set(&ctx->notified, true); + } +-- +2.27.0 + diff --git a/kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch b/kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch new file mode 100755 index 0000000..1a20688 --- /dev/null +++ b/kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch @@ -0,0 +1,53 @@ +From 96c8fcafa7325cd0e8a23a743a55f0ad0aa9f79b Mon Sep 17 00:00:00 2001 +From: Gerd Hoffmann +Date: Thu, 18 Mar 2021 09:13:42 -0400 +Subject: [PATCH 5/5] audio: audio_generic_get_buffer_in should honor *size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Gerd Hoffmann +Message-id: <20210318091342.3232471-2-kraxel@redhat.com> +Patchwork-id: 101352 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] audio: audio_generic_get_buffer_in should honor *size +Bugzilla: 1932823 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Danilo de Paula +RH-Acked-by: Philippe Mathieu-Daudé + +From: Volker Rümelin + +The function generic_get_buffer_in currently ignores the *size +parameter and may return a buffer larger than *size. + +As a result the variable samples in function +audio_pcm_hw_run_in may underflow. The while loop then most +likely will never terminate. + +Buglink: http://bugs.debian.org/948658 +Signed-off-by: Volker Rümelin +Message-Id: <20200123074943.6699-9-vr_qemu@t-online.de> +Signed-off-by: Gerd Hoffmann +(cherry picked from commit 599eac4e5a41e828645594097daee39373acc3c0) +Signed-off-by: Danilo C. L.
de Paula +--- + audio/audio.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/audio/audio.c b/audio/audio.c +index 56fae55047..39a62fc62a 100644 +--- a/audio/audio.c ++++ b/audio/audio.c +@@ -1402,7 +1402,8 @@ void *audio_generic_get_buffer_in(HWVoiceIn *hw, size_t *size) + } + assert(start >= 0 && start < hw->size_emul); + +- *size = MIN(hw->pending_emul, hw->size_emul - start); ++ *size = MIN(*size, hw->pending_emul); ++ *size = MIN(*size, hw->size_emul - start); + return hw->buf_emul + start; + } + +-- +2.27.0 + diff --git a/kvm-backup-Improve-error-for-bdrv_getlength-failure.patch b/kvm-backup-Improve-error-for-bdrv_getlength-failure.patch new file mode 100755 index 0000000..8fa2629 --- /dev/null +++ b/kvm-backup-Improve-error-for-bdrv_getlength-failure.patch @@ -0,0 +1,51 @@ +From fba183faf8ce819262a1a47f8531ea68051cdce7 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:19 +0100 +Subject: [PATCH 20/26] backup: Improve error for bdrv_getlength() failure + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-6-kwolf@redhat.com> +Patchwork-id: 97103 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 05/11] backup: Improve error for bdrv_getlength() failure +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +bdrv_get_device_name() will be an empty string with modern management +tools that don't use -drive. Use bdrv_get_device_or_node_name() instead +so that the node name is used if the BlockBackend is anonymous. + +While at it, start with upper case to make the message consistent with +the rest of the function. + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Message-Id: <20200430142755.315494-3-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 58226634c4b02af7b10862f7fbd3610a344bfb7f) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/backup.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/block/backup.c b/block/backup.c +index ec50946..7c6ddd2 100644 +--- a/block/backup.c ++++ b/block/backup.c +@@ -408,8 +408,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, + + len = bdrv_getlength(bs); + if (len < 0) { +- error_setg_errno(errp, -len, "unable to get length for '%s'", +- bdrv_get_device_name(bs)); ++ error_setg_errno(errp, -len, "Unable to get length for '%s'", ++ bdrv_get_device_or_node_name(bs)); + goto error; + } + +-- +1.8.3.1 + diff --git a/kvm-backup-Make-sure-that-source-and-target-size-match.patch b/kvm-backup-Make-sure-that-source-and-target-size-match.patch new file mode 100755 index 0000000..05b5d10 --- /dev/null +++ b/kvm-backup-Make-sure-that-source-and-target-size-match.patch @@ -0,0 +1,124 @@ +From e56abd782be8bb41bb07c0317d008f95ec9a8ee5 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:20 +0100 +Subject: [PATCH 21/26] backup: Make sure that source and target size match + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-7-kwolf@redhat.com> +Patchwork-id: 97107 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 06/11] backup: Make sure that source and target size match +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +Since the introduction of a backup filter node in commit 00e30f05d, the +backup block job crashes when the target image is smaller than the +source image because it will try to write after the end of the target +node without having BLK_PERM_RESIZE. (Previously, the BlockBackend layer +would have caught this and errored out gracefully.) + +We can fix this and even do better than the old behaviour: Check that +source and target have the same image size at the start of the block job +and unshare BLK_PERM_RESIZE. 
(This permission was already unshared +before the same commit 00e30f05d, but the BlockBackend that was used to +make the restriction was removed without a replacement.) This will +immediately error out when starting the job instead of only when writing +to a block that doesn't exist in the target. + +Longer target than source would technically work because we would never +write to blocks that don't exist, but semantically these are invalid, +too, because a backup is supposed to create a copy, not just an image +that starts with a copy. + +Fixes: 00e30f05de1d19586345ec373970ef4c192c6270 +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1778593 +Cc: qemu-stable@nongnu.org +Signed-off-by: Kevin Wolf +Message-Id: <20200430142755.315494-4-kwolf@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit 958a04bd32af18d9a207bcc78046e56a202aebc2) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/backup-top.c | 14 +++++++++----- + block/backup.c | 14 +++++++++++++- + 2 files changed, 22 insertions(+), 6 deletions(-) + +diff --git a/block/backup-top.c b/block/backup-top.c +index b8d863f..6756091 100644 +--- a/block/backup-top.c ++++ b/block/backup-top.c +@@ -143,8 +143,10 @@ static void backup_top_child_perm(BlockDriverState *bs, BdrvChild *c, + * + * Share write to target (child_file), to not interfere + * with guest writes to its disk which may be in target backing chain. ++ * Can't resize during a backup block job because we check the size ++ * only upfront. 
+ */ +- *nshared = BLK_PERM_ALL; ++ *nshared = BLK_PERM_ALL & ~BLK_PERM_RESIZE; + *nperm = BLK_PERM_WRITE; + } else { + /* Source child */ +@@ -154,7 +156,7 @@ static void backup_top_child_perm(BlockDriverState *bs, BdrvChild *c, + if (perm & BLK_PERM_WRITE) { + *nperm = *nperm | BLK_PERM_CONSISTENT_READ; + } +- *nshared &= ~BLK_PERM_WRITE; ++ *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); + } + } + +@@ -187,10 +189,12 @@ BlockDriverState *bdrv_backup_top_append(BlockDriverState *source, + { + Error *local_err = NULL; + BDRVBackupTopState *state; +- BlockDriverState *top = bdrv_new_open_driver(&bdrv_backup_top_filter, +- filter_node_name, +- BDRV_O_RDWR, errp); ++ BlockDriverState *top; ++ ++ assert(source->total_sectors == target->total_sectors); + ++ top = bdrv_new_open_driver(&bdrv_backup_top_filter, filter_node_name, ++ BDRV_O_RDWR, errp); + if (!top) { + return NULL; + } +diff --git a/block/backup.c b/block/backup.c +index 7c6ddd2..821c9fb 100644 +--- a/block/backup.c ++++ b/block/backup.c +@@ -348,7 +348,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque, + JobTxn *txn, Error **errp) + { +- int64_t len; ++ int64_t len, target_len; + BackupBlockJob *job = NULL; + int64_t cluster_size; + BdrvRequestFlags write_flags; +@@ -413,6 +413,18 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, + goto error; + } + ++ target_len = bdrv_getlength(target); ++ if (target_len < 0) { ++ error_setg_errno(errp, -target_len, "Unable to get length for '%s'", ++ bdrv_get_device_or_node_name(bs)); ++ goto error; ++ } ++ ++ if (target_len != len) { ++ error_setg(errp, "Source and target image have different sizes"); ++ goto error; ++ } ++ + cluster_size = backup_calculate_cluster_size(target, errp); + if (cluster_size < 0) { + goto error; +-- +1.8.3.1 + diff --git a/kvm-backup-don-t-acquire-aio_context-in-backup_clean.patch b/kvm-backup-don-t-acquire-aio_context-in-backup_clean.patch new file 
mode 100755 index 0000000..7fb76c1 --- /dev/null +++ b/kvm-backup-don-t-acquire-aio_context-in-backup_clean.patch @@ -0,0 +1,57 @@ +From 619b3aac9790a7ca7c01846144395a318a9ab250 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:14 +0100 +Subject: [PATCH 3/6] backup: don't acquire aio_context in backup_clean + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-4-kwolf@redhat.com> +Patchwork-id: 94596 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/6] backup: don't acquire aio_context in backup_clean +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +From: Stefan Reiter + +All code-paths leading to backup_clean (via job_clean) have the job's +context already acquired. The job's context is guaranteed to be the same +as the one used by backup_top via backup_job_create. + +Since the previous logic effectively acquired the lock twice, this +broke cleanup of backups for disks using IO threads, since the BDRV_POLL_WHILE +in bdrv_backup_top_drop -> bdrv_do_drained_begin would only release the lock +once, thus deadlocking with the IO thread. + +This is a partial revert of 0abf2581717a19. + +Signed-off-by: Stefan Reiter +Reviewed-by: Max Reitz +Message-Id: <20200407115651.69472-4-s.reiter@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit eca0f3524a4eb57d03a56b0cbcef5527a0981ce4) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/backup.c | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/block/backup.c b/block/backup.c +index 1383e21..ec50946 100644 +--- a/block/backup.c ++++ b/block/backup.c +@@ -135,11 +135,7 @@ static void backup_abort(Job *job) + static void backup_clean(Job *job) + { + BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); +- AioContext *aio_context = bdrv_get_aio_context(s->backup_top); +- +- aio_context_acquire(aio_context); + bdrv_backup_top_drop(s->backup_top); +- aio_context_release(aio_context); + } + + void backup_do_checkpoint(BlockJob *job, Error **errp) +-- +1.8.3.1 + diff --git a/kvm-backup-top-Begin-drain-earlier.patch b/kvm-backup-top-Begin-drain-earlier.patch new file mode 100755 index 0000000..ef289b7 --- /dev/null +++ b/kvm-backup-top-Begin-drain-earlier.patch @@ -0,0 +1,56 @@ +From bc78ee07bf400cbff0021367e05d308870471710 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:45 +0000 +Subject: [PATCH 12/18] backup-top: Begin drain earlier + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-6-slp@redhat.com> +Patchwork-id: 93757 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 5/9] backup-top: Begin drain earlier +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +From: Max Reitz + +When dropping backup-top, we need to drain the node before freeing the +BlockCopyState. Otherwise, requests may still be in flight and then the +assertion in shres_destroy() will fail. + +(This becomes visible in intermittent failure of 056.) + +Cc: qemu-stable@nongnu.org +Signed-off-by: Max Reitz +Message-id: 20191219182638.104621-1-mreitz@redhat.com +Signed-off-by: Max Reitz +(cherry picked from commit 503ca1262bab2c11c533a4816d1ff4297d4f58a6) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. 
de Paula +--- + block/backup-top.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/block/backup-top.c b/block/backup-top.c +index 7cdb1f8..818d3f2 100644 +--- a/block/backup-top.c ++++ b/block/backup-top.c +@@ -257,12 +257,12 @@ void bdrv_backup_top_drop(BlockDriverState *bs) + BDRVBackupTopState *s = bs->opaque; + AioContext *aio_context = bdrv_get_aio_context(bs); + +- block_copy_state_free(s->bcs); +- + aio_context_acquire(aio_context); + + bdrv_drained_begin(bs); + ++ block_copy_state_free(s->bcs); ++ + s->active = false; + bdrv_child_refresh_perms(bs, bs->backing, &error_abort); + bdrv_replace_node(bs, backing_bs(bs), &error_abort); +-- +1.8.3.1 + diff --git a/kvm-block-Activate-recursively-even-for-already-active-n.patch b/kvm-block-Activate-recursively-even-for-already-active-n.patch new file mode 100755 index 0000000..d6cad06 --- /dev/null +++ b/kvm-block-Activate-recursively-even-for-already-active-n.patch @@ -0,0 +1,116 @@ +From 0ef6691ce8964bb2bbd677756c4e594793ca3ad8 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:01 +0000 +Subject: [PATCH 04/18] block: Activate recursively even for already active + nodes + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-4-kwolf@redhat.com> +Patchwork-id: 93749 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 3/6] block: Activate recursively even for already active nodes +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +bdrv_invalidate_cache_all() assumes that all nodes in a given subtree +are either active or inactive when it starts. Therefore, as soon as it +arrives at an already active node, it stops. + +However, this assumption is wrong. For example, it's possible to take a +snapshot of an inactive node, which results in an active overlay over an +inactive backing file. The active overlay is probably also the root node +of an inactive BlockBackend (blk->disable_perm == true). 
+ +In this case, bdrv_invalidate_cache_all() does not need to do anything +to activate the overlay node, but it still needs to recurse into the +children and the parents to make sure that after returning success, +really everything is activated. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Kevin Wolf +Reviewed-by: Max Reitz +(cherry picked from commit 7bb4941ace471fc7dd6ded4749b95b9622baa6ed) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 50 ++++++++++++++++++++++++-------------------------- + 1 file changed, 24 insertions(+), 26 deletions(-) + +diff --git a/block.c b/block.c +index 473eb6e..2e5e8b6 100644 +--- a/block.c ++++ b/block.c +@@ -5335,10 +5335,6 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, + return; + } + +- if (!(bs->open_flags & BDRV_O_INACTIVE)) { +- return; +- } +- + QLIST_FOREACH(child, &bs->children, next) { + bdrv_co_invalidate_cache(child->bs, &local_err); + if (local_err) { +@@ -5360,34 +5356,36 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, + * just keep the extended permissions for the next time that an activation + * of the image is tried. 
+ */ +- bs->open_flags &= ~BDRV_O_INACTIVE; +- bdrv_get_cumulative_perm(bs, &perm, &shared_perm); +- ret = bdrv_check_perm(bs, NULL, perm, shared_perm, NULL, NULL, &local_err); +- if (ret < 0) { +- bs->open_flags |= BDRV_O_INACTIVE; +- error_propagate(errp, local_err); +- return; +- } +- bdrv_set_perm(bs, perm, shared_perm); +- +- if (bs->drv->bdrv_co_invalidate_cache) { +- bs->drv->bdrv_co_invalidate_cache(bs, &local_err); +- if (local_err) { ++ if (bs->open_flags & BDRV_O_INACTIVE) { ++ bs->open_flags &= ~BDRV_O_INACTIVE; ++ bdrv_get_cumulative_perm(bs, &perm, &shared_perm); ++ ret = bdrv_check_perm(bs, NULL, perm, shared_perm, NULL, NULL, &local_err); ++ if (ret < 0) { + bs->open_flags |= BDRV_O_INACTIVE; + error_propagate(errp, local_err); + return; + } +- } ++ bdrv_set_perm(bs, perm, shared_perm); + +- FOR_EACH_DIRTY_BITMAP(bs, bm) { +- bdrv_dirty_bitmap_skip_store(bm, false); +- } ++ if (bs->drv->bdrv_co_invalidate_cache) { ++ bs->drv->bdrv_co_invalidate_cache(bs, &local_err); ++ if (local_err) { ++ bs->open_flags |= BDRV_O_INACTIVE; ++ error_propagate(errp, local_err); ++ return; ++ } ++ } + +- ret = refresh_total_sectors(bs, bs->total_sectors); +- if (ret < 0) { +- bs->open_flags |= BDRV_O_INACTIVE; +- error_setg_errno(errp, -ret, "Could not refresh total sector count"); +- return; ++ FOR_EACH_DIRTY_BITMAP(bs, bm) { ++ bdrv_dirty_bitmap_skip_store(bm, false); ++ } ++ ++ ret = refresh_total_sectors(bs, bs->total_sectors); ++ if (ret < 0) { ++ bs->open_flags |= BDRV_O_INACTIVE; ++ error_setg_errno(errp, -ret, "Could not refresh total sector count"); ++ return; ++ } + } + + QLIST_FOREACH(parent, &bs->parents, next_parent) { +-- +1.8.3.1 + diff --git a/kvm-block-Add-flags-to-BlockDriver.bdrv_co_truncate.patch b/kvm-block-Add-flags-to-BlockDriver.bdrv_co_truncate.patch new file mode 100755 index 0000000..bc67279 --- /dev/null +++ b/kvm-block-Add-flags-to-BlockDriver.bdrv_co_truncate.patch @@ -0,0 +1,283 @@ +From 13e2076f5c4adbc9a3f96c8978150aa5e423e14a Mon Sep 
17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:30 +0100 +Subject: [PATCH 02/17] block: Add flags to BlockDriver.bdrv_co_truncate() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-2-kwolf@redhat.com> +Patchwork-id: 97448 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 01/11] block: Add flags to BlockDriver.bdrv_co_truncate() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +This adds a new BdrvRequestFlags parameter to the .bdrv_co_truncate() +driver callbacks, and a supported_truncate_flags field in +BlockDriverState that allows drivers to advertise support for request +flags in the context of truncate. + +For now, we always pass 0 and no drivers declare support for any flag. + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Max Reitz +Message-Id: <20200424125448.63318-2-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 92b92799dc8662b6f71809100a4aabc1ae408ebb) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/crypto.c | 3 ++- + block/file-posix.c | 2 +- + block/file-win32.c | 2 +- + block/gluster.c | 1 + + block/io.c | 8 +++++++- + block/iscsi.c | 2 +- + block/nfs.c | 3 ++- + block/qcow2.c | 2 +- + block/qed.c | 1 + + block/raw-format.c | 2 +- + block/rbd.c | 1 + + block/sheepdog.c | 4 ++-- + block/ssh.c | 2 +- + include/block/block_int.h | 10 +++++++++- + tests/test-block-iothread.c | 3 ++- + 15 files changed, 33 insertions(+), 13 deletions(-) + +diff --git a/block/crypto.c b/block/crypto.c +index 5e3b15c..6e4b726 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -299,7 +299,8 @@ static int block_crypto_co_create_generic(BlockDriverState *bs, + + static int coroutine_fn + block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp) + { + BlockCrypto *crypto = bs->opaque; + uint64_t payload_offset = +diff --git a/block/file-posix.c b/block/file-posix.c +index 1609598..7551e8d 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -2021,7 +2021,7 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, + + static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVRawState *s = bs->opaque; + struct stat st; +diff --git a/block/file-win32.c b/block/file-win32.c +index 1585983..a6b0dda 100644 +--- a/block/file-win32.c ++++ b/block/file-win32.c +@@ -469,7 +469,7 @@ static void raw_close(BlockDriverState *bs) + + static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVRawState *s = bs->opaque; + LONG low, high; +diff --git a/block/gluster.c b/block/gluster.c +index 0aa1f2c..d06df90 100644 +--- a/block/gluster.c ++++ b/block/gluster.c +@@ -1228,6 +1228,7 @@ static 
coroutine_fn int qemu_gluster_co_truncate(BlockDriverState *bs, + int64_t offset, + bool exact, + PreallocMode prealloc, ++ BdrvRequestFlags flags, + Error **errp) + { + BDRVGlusterState *s = bs->opaque; +diff --git a/block/io.c b/block/io.c +index f75777f..549e5a4 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -3320,6 +3320,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + BlockDriverState *bs = child->bs; + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; ++ BdrvRequestFlags flags = 0; + int64_t old_size, new_bytes; + int ret; + +@@ -3370,7 +3371,12 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + } + + if (drv->bdrv_co_truncate) { +- ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, errp); ++ if (flags & ~bs->supported_truncate_flags) { ++ error_setg(errp, "Block driver does not support requested flags"); ++ ret = -ENOTSUP; ++ goto out; ++ } ++ ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); + } else if (bs->file && drv->is_filter) { + ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp); + } else { +diff --git a/block/iscsi.c b/block/iscsi.c +index 16b0716..0bea2d3 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -2125,7 +2125,7 @@ static void iscsi_reopen_commit(BDRVReopenState *reopen_state) + + static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + IscsiLun *iscsilun = bs->opaque; + int64_t cur_length; +diff --git a/block/nfs.c b/block/nfs.c +index cc2413d..2393fbf 100644 +--- a/block/nfs.c ++++ b/block/nfs.c +@@ -755,7 +755,8 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) + + static int coroutine_fn + nfs_file_co_truncate(BlockDriverState *bs, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp) + { + NFSClient 
*client = bs->opaque; + int ret; +diff --git a/block/qcow2.c b/block/qcow2.c +index dbd870a..977445e 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -3948,7 +3948,7 @@ fail: + + static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVQcow2State *s = bs->opaque; + uint64_t old_length; +diff --git a/block/qed.c b/block/qed.c +index 1af9b3c..fb6100b 100644 +--- a/block/qed.c ++++ b/block/qed.c +@@ -1467,6 +1467,7 @@ static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs, + int64_t offset, + bool exact, + PreallocMode prealloc, ++ BdrvRequestFlags flags, + Error **errp) + { + BDRVQEDState *s = bs->opaque; +diff --git a/block/raw-format.c b/block/raw-format.c +index 4bb54f4..f994c4a 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -371,7 +371,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) + + static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVRawState *s = bs->opaque; + +diff --git a/block/rbd.c b/block/rbd.c +index 8847259..fcdb60a 100644 +--- a/block/rbd.c ++++ b/block/rbd.c +@@ -1090,6 +1090,7 @@ static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs, + int64_t offset, + bool exact, + PreallocMode prealloc, ++ BdrvRequestFlags flags, + Error **errp) + { + int r; +diff --git a/block/sheepdog.c b/block/sheepdog.c +index a8a7e32..077aed8 100644 +--- a/block/sheepdog.c ++++ b/block/sheepdog.c +@@ -2288,7 +2288,7 @@ static int64_t sd_getlength(BlockDriverState *bs) + + static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVSheepdogState *s = bs->opaque; + int ret, fd; +@@ -2604,7 +2604,7 @@ static coroutine_fn int 
sd_co_writev(BlockDriverState *bs, int64_t sector_num, + + assert(!flags); + if (offset > s->inode.vdi_size) { +- ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, NULL); ++ ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + return ret; + } +diff --git a/block/ssh.c b/block/ssh.c +index 84e9282..9eb33df 100644 +--- a/block/ssh.c ++++ b/block/ssh.c +@@ -1298,7 +1298,7 @@ static int64_t ssh_getlength(BlockDriverState *bs) + + static int coroutine_fn ssh_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVSSHState *s = bs->opaque; + +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 876a83d..41f13ec 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -356,7 +356,7 @@ struct BlockDriver { + */ + int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp); ++ BdrvRequestFlags flags, Error **errp); + + int64_t (*bdrv_getlength)(BlockDriverState *bs); + bool has_variable_length; +@@ -849,6 +849,14 @@ struct BlockDriverState { + /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA, + * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */ + unsigned int supported_zero_flags; ++ /* ++ * Flags honoured during truncate (so far: BDRV_REQ_ZERO_WRITE). ++ * ++ * If BDRV_REQ_ZERO_WRITE is given, the truncate operation must make sure ++ * that any added space reads as all zeros. If this can't be guaranteed, ++ * the operation must fail. ++ */ ++ unsigned int supported_truncate_flags; + + /* the following member gives a name to every node on the bs graph. 
*/ + char node_name[32]; +diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c +index 0c86180..2f3b763 100644 +--- a/tests/test-block-iothread.c ++++ b/tests/test-block-iothread.c +@@ -46,7 +46,8 @@ static int coroutine_fn bdrv_test_co_pdiscard(BlockDriverState *bs, + + static int coroutine_fn + bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp) + { + return 0; + } +-- +1.8.3.1 + diff --git a/kvm-block-Add-flags-to-bdrv-_co-_truncate.patch b/kvm-block-Add-flags-to-bdrv-_co-_truncate.patch new file mode 100755 index 0000000..3da05ff --- /dev/null +++ b/kvm-block-Add-flags-to-bdrv-_co-_truncate.patch @@ -0,0 +1,353 @@ +From 50127f0ff9e13a15fd5bfeb2662e2404ff20f364 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:31 +0100 +Subject: [PATCH 03/17] block: Add flags to bdrv(_co)_truncate() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-3-kwolf@redhat.com> +Patchwork-id: 97445 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 02/11] block: Add flags to bdrv(_co)_truncate() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +Now that block drivers can support flags for .bdrv_co_truncate, expose +the parameter in the node level interfaces bdrv_co_truncate() and +bdrv_truncate(). + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Max Reitz +Message-Id: <20200424125448.63318-3-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 7b8e4857426f2e2de2441749996c6161b550bada) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/block-backend.c | 2 +- + block/crypto.c | 2 +- + block/io.c | 12 +++++++----- + block/parallels.c | 6 +++--- + block/qcow.c | 4 ++-- + block/qcow2-refcount.c | 2 +- + block/qcow2.c | 15 +++++++++------ + block/raw-format.c | 2 +- + block/vhdx-log.c | 2 +- + block/vhdx.c | 2 +- + block/vmdk.c | 2 +- + include/block/block.h | 5 +++-- + tests/test-block-iothread.c | 6 +++--- + 13 files changed, 34 insertions(+), 28 deletions(-) + +diff --git a/block/block-backend.c b/block/block-backend.c +index 38ae413..8be2006 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -2144,7 +2144,7 @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, + return -ENOMEDIUM; + } + +- return bdrv_truncate(blk->root, offset, exact, prealloc, errp); ++ return bdrv_truncate(blk->root, offset, exact, prealloc, 0, errp); + } + + int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, +diff --git a/block/crypto.c b/block/crypto.c +index 6e4b726..fcb4a97 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -313,7 +313,7 @@ block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact, + + offset += payload_offset; + +- return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp); ++ return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp); + } + + static void block_crypto_close(BlockDriverState *bs) +diff --git a/block/io.c b/block/io.c +index 549e5a4..3235ce5 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -3315,12 +3315,12 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs) + * 'offset' bytes in length. 
+ */ + int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp) + { + BlockDriverState *bs = child->bs; + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; +- BdrvRequestFlags flags = 0; + int64_t old_size, new_bytes; + int ret; + +@@ -3378,7 +3378,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + } + ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); + } else if (bs->file && drv->is_filter) { +- ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp); ++ ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); + } else { + error_setg(errp, "Image format driver does not support resize"); + ret = -ENOTSUP; +@@ -3411,6 +3411,7 @@ typedef struct TruncateCo { + int64_t offset; + bool exact; + PreallocMode prealloc; ++ BdrvRequestFlags flags; + Error **errp; + int ret; + } TruncateCo; +@@ -3419,12 +3420,12 @@ static void coroutine_fn bdrv_truncate_co_entry(void *opaque) + { + TruncateCo *tco = opaque; + tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact, +- tco->prealloc, tco->errp); ++ tco->prealloc, tco->flags, tco->errp); + aio_wait_kick(); + } + + int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp) + { + Coroutine *co; + TruncateCo tco = { +@@ -3432,6 +3433,7 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, + .offset = offset, + .exact = exact, + .prealloc = prealloc, ++ .flags = flags, + .errp = errp, + .ret = NOT_DONE, + }; +diff --git a/block/parallels.c b/block/parallels.c +index 6d4ed77..2be92cf 100644 +--- a/block/parallels.c ++++ b/block/parallels.c +@@ -203,7 +203,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, + } else { + ret = bdrv_truncate(bs->file, + (s->data_end + 
space) << BDRV_SECTOR_BITS, +- false, PREALLOC_MODE_OFF, NULL); ++ false, PREALLOC_MODE_OFF, 0, NULL); + } + if (ret < 0) { + return ret; +@@ -493,7 +493,7 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, + * That means we have to pass exact=true. + */ + ret = bdrv_truncate(bs->file, res->image_end_offset, true, +- PREALLOC_MODE_OFF, &local_err); ++ PREALLOC_MODE_OFF, 0, &local_err); + if (ret < 0) { + error_report_err(local_err); + res->check_errors++; +@@ -889,7 +889,7 @@ static void parallels_close(BlockDriverState *bs) + + /* errors are ignored, so we might as well pass exact=true */ + bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, true, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + } + + g_free(s->bat_dirty_bmap); +diff --git a/block/qcow.c b/block/qcow.c +index 8973e4e..6b5f226 100644 +--- a/block/qcow.c ++++ b/block/qcow.c +@@ -480,7 +480,7 @@ static int get_cluster_offset(BlockDriverState *bs, + return -E2BIG; + } + ret = bdrv_truncate(bs->file, cluster_offset + s->cluster_size, +- false, PREALLOC_MODE_OFF, NULL); ++ false, PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + return ret; + } +@@ -1035,7 +1035,7 @@ static int qcow_make_empty(BlockDriverState *bs) + l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) + return ret; + +diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c +index f67ac6b..3a90d75 100644 +--- a/block/qcow2-refcount.c ++++ b/block/qcow2-refcount.c +@@ -2017,7 +2017,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, + } + + ret = bdrv_truncate(bs->file, offset + s->cluster_size, false, +- PREALLOC_MODE_OFF, &local_err); ++ PREALLOC_MODE_OFF, 0, &local_err); + if (ret < 0) { + error_report_err(local_err); + goto resize_fail; +diff --git a/block/qcow2.c b/block/qcow2.c +index 977445e..c0fdcb9 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c 
+@@ -3082,7 +3082,7 @@ static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset, + mode = PREALLOC_MODE_OFF; + } + ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false, +- mode, errp); ++ mode, 0, errp); + if (ret < 0) { + return ret; + } +@@ -4044,7 +4044,7 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + * always fulfilled, so there is no need to pass it on.) + */ + bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size, +- false, PREALLOC_MODE_OFF, &local_err); ++ false, PREALLOC_MODE_OFF, 0, &local_err); + if (local_err) { + warn_reportf_err(local_err, + "Failed to truncate the tail of the image: "); +@@ -4066,7 +4066,8 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + * file should be resized to the exact target size, too, + * so we pass @exact here. + */ +- ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, errp); ++ ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, 0, ++ errp); + if (ret < 0) { + goto fail; + } +@@ -4152,7 +4153,8 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + new_file_size = allocation_start + + nb_new_data_clusters * s->cluster_size; + /* Image file grows, so @exact does not matter */ +- ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp); ++ ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0, ++ errp); + if (ret < 0) { + error_prepend(errp, "Failed to resize underlying file: "); + qcow2_free_clusters(bs, allocation_start, +@@ -4255,7 +4257,8 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs, + if (len < 0) { + return len; + } +- return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, NULL); ++ return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, 0, ++ NULL); + } + + if (offset_into_cluster(s, offset)) { +@@ -4493,7 +4496,7 @@ static int make_completely_empty(BlockDriverState *bs) + } + + ret = 
bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false, +- PREALLOC_MODE_OFF, &local_err); ++ PREALLOC_MODE_OFF, 0, &local_err); + if (ret < 0) { + error_report_err(local_err); + goto fail; +diff --git a/block/raw-format.c b/block/raw-format.c +index f994c4a..c3acf9a 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -387,7 +387,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + + s->size = offset; + offset += s->offset; +- return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp); ++ return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp); + } + + static void raw_eject(BlockDriverState *bs, bool eject_flag) +diff --git a/block/vhdx-log.c b/block/vhdx-log.c +index 13a49c2..404fb5f 100644 +--- a/block/vhdx-log.c ++++ b/block/vhdx-log.c +@@ -558,7 +558,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, + goto exit; + } + ret = bdrv_truncate(bs->file, new_file_size, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + goto exit; + } +diff --git a/block/vhdx.c b/block/vhdx.c +index 33e57cd..5dfbb20 100644 +--- a/block/vhdx.c ++++ b/block/vhdx.c +@@ -1264,7 +1264,7 @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, + } + + return bdrv_truncate(bs->file, *new_offset + s->block_size, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + } + + /* +diff --git a/block/vmdk.c b/block/vmdk.c +index eb726f2..1bbf937 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -2077,7 +2077,7 @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, + } + length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE); + ret = bdrv_truncate(s->extents[i].file, length, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + return ret; + } +diff --git a/include/block/block.h b/include/block/block.h +index b2a3074..4913596 100644 +--- a/include/block/block.h ++++ b/include/block/block.h +@@ -348,9 
+348,10 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + void bdrv_refresh_filename(BlockDriverState *bs); + + int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp); ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp); + int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp); ++ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); + + int64_t bdrv_nb_sectors(BlockDriverState *bs); + int64_t bdrv_getlength(BlockDriverState *bs); +diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c +index 2f3b763..71e9bce 100644 +--- a/tests/test-block-iothread.c ++++ b/tests/test-block-iothread.c +@@ -186,18 +186,18 @@ static void test_sync_op_truncate(BdrvChild *c) + int ret; + + /* Normal success path */ +- ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL); ++ ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, 0, NULL); + g_assert_cmpint(ret, ==, 0); + + /* Early error: Negative offset */ +- ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, NULL); ++ ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, 0, NULL); + g_assert_cmpint(ret, ==, -EINVAL); + + /* Error: Read-only image */ + c->bs->read_only = true; + c->bs->open_flags &= ~BDRV_O_RDWR; + +- ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL); ++ ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, 0, NULL); + g_assert_cmpint(ret, ==, -EACCES); + + c->bs->read_only = false; +-- +1.8.3.1 + diff --git a/kvm-block-Call-attention-to-truncation-of-long-NBD-expor.patch b/kvm-block-Call-attention-to-truncation-of-long-NBD-expor.patch new file mode 100755 index 0000000..190826f --- /dev/null +++ b/kvm-block-Call-attention-to-truncation-of-long-NBD-expor.patch @@ -0,0 +1,105 @@ +From c8ecaea34f03b8ddda7d2b41b0d6f397469c8959 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Wed, 10 Jun 2020 18:32:02 -0400 +Subject: [PATCH 2/2] 
block: Call attention to truncation of long NBD exports + +RH-Author: Eric Blake +Message-id: <20200610183202.3780750-3-eblake@redhat.com> +Patchwork-id: 97495 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] block: Call attention to truncation of long NBD exports +Bugzilla: 1845384 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Commit 93676c88 relaxed our NBD client code to request export names up +to the NBD protocol maximum of 4096 bytes without NUL terminator, even +though the block layer can't store anything longer than 4096 bytes +including NUL terminator for display to the user. Since this means +there are some export names where we have to truncate things, we can +at least try to make the truncation a bit more obvious for the user. +Note that in spite of the truncated display name, we can still +communicate with an NBD server using such a long export name; this was +deemed nicer than refusing to even connect to such a server (since the +server may not be under our control, and since determining our actual +length limits gets tricky when nbd://host:port/export and +nbd+unix:///export?socket=/path are themselves variable-length +expansions beyond the export name but count towards the block layer +name length). 
+ +Reported-by: Xueqiang Wei +Fixes: https://bugzilla.redhat.com/1843684 +Signed-off-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200610163741.3745251-3-eblake@redhat.com> +(cherry picked from commit 5c86bdf1208916ece0b87e1151c9b48ee54faa3e) +Signed-off-by: Eric Blake +Signed-off-by: Eduardo Lima (Etrunko) +--- + block.c | 7 +++++-- + block/nbd.c | 21 +++++++++++++-------- + 2 files changed, 18 insertions(+), 10 deletions(-) + +diff --git a/block.c b/block.c +index 12c8941879..57740d312e 100644 +--- a/block.c ++++ b/block.c +@@ -6683,8 +6683,11 @@ void bdrv_refresh_filename(BlockDriverState *bs) + pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename); + } else { + QString *json = qobject_to_json(QOBJECT(bs->full_open_options)); +- snprintf(bs->filename, sizeof(bs->filename), "json:%s", +- qstring_get_str(json)); ++ if (snprintf(bs->filename, sizeof(bs->filename), "json:%s", ++ qstring_get_str(json)) >= sizeof(bs->filename)) { ++ /* Give user a hint if we truncated things. 
*/ ++ strcpy(bs->filename + sizeof(bs->filename) - 4, "..."); ++ } + qobject_unref(json); + } + } +diff --git a/block/nbd.c b/block/nbd.c +index 927915d93d..5bb154017d 100644 +--- a/block/nbd.c ++++ b/block/nbd.c +@@ -1978,6 +1978,7 @@ static void nbd_refresh_filename(BlockDriverState *bs) + { + BDRVNBDState *s = bs->opaque; + const char *host = NULL, *port = NULL, *path = NULL; ++ size_t len = 0; + + if (s->saddr->type == SOCKET_ADDRESS_TYPE_INET) { + const InetSocketAddress *inet = &s->saddr->u.inet; +@@ -1990,17 +1991,21 @@ static void nbd_refresh_filename(BlockDriverState *bs) + } /* else can't represent as pseudo-filename */ + + if (path && s->export) { +- snprintf(bs->exact_filename, sizeof(bs->exact_filename), +- "nbd+unix:///%s?socket=%s", s->export, path); ++ len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), ++ "nbd+unix:///%s?socket=%s", s->export, path); + } else if (path && !s->export) { +- snprintf(bs->exact_filename, sizeof(bs->exact_filename), +- "nbd+unix://?socket=%s", path); ++ len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), ++ "nbd+unix://?socket=%s", path); + } else if (host && s->export) { +- snprintf(bs->exact_filename, sizeof(bs->exact_filename), +- "nbd://%s:%s/%s", host, port, s->export); ++ len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), ++ "nbd://%s:%s/%s", host, port, s->export); + } else if (host && !s->export) { +- snprintf(bs->exact_filename, sizeof(bs->exact_filename), +- "nbd://%s:%s", host, port); ++ len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), ++ "nbd://%s:%s", host, port); ++ } ++ if (len > sizeof(bs->exact_filename)) { ++ /* Name is too long to represent exactly, so leave it empty. 
*/ ++ bs->exact_filename[0] = '\0'; + } + } + +-- +2.27.0 + diff --git a/kvm-block-Fix-blk-in_flight-during-blk_wait_while_draine.patch b/kvm-block-Fix-blk-in_flight-during-blk_wait_while_draine.patch new file mode 100755 index 0000000..b16c0b7 --- /dev/null +++ b/kvm-block-Fix-blk-in_flight-during-blk_wait_while_draine.patch @@ -0,0 +1,84 @@ +From f17b37b58a57d849d2ff5fa04f149d9415803a39 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:17 +0100 +Subject: [PATCH 6/6] block: Fix blk->in_flight during blk_wait_while_drained() + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-7-kwolf@redhat.com> +Patchwork-id: 94599 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 6/6] block: Fix blk->in_flight during blk_wait_while_drained() +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +Waiting in blk_wait_while_drained() while blk->in_flight is increased +for the current request is wrong because it will cause the drain +operation to deadlock. + +This patch makes sure that blk_wait_while_drained() is called with +blk->in_flight increased exactly once for the current request, and that +it temporarily decreases the counter while it waits. + +Fixes: cf3129323f900ef5ddbccbe86e4fa801e88c566e +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Max Reitz +Message-Id: <20200407121259.21350-4-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 7f16476fab14fc32388e0ebae793f64673848efa) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/block-backend.c | 17 +++++------------ + 1 file changed, 5 insertions(+), 12 deletions(-) + +diff --git a/block/block-backend.c b/block/block-backend.c +index 610dbfa..38ae413 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -1140,10 +1140,15 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset, + return 0; + } + ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ + static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) + { ++ assert(blk->in_flight > 0); ++ + if (blk->quiesce_counter && !blk->disable_request_queuing) { ++ blk_dec_in_flight(blk); + qemu_co_queue_wait(&blk->queued_requests, NULL); ++ blk_inc_in_flight(blk); + } + } + +@@ -1418,12 +1423,6 @@ static void blk_aio_read_entry(void *opaque) + BlkRwCo *rwco = &acb->rwco; + QEMUIOVector *qiov = rwco->iobuf; + +- if (rwco->blk->quiesce_counter) { +- blk_dec_in_flight(rwco->blk); +- blk_wait_while_drained(rwco->blk); +- blk_inc_in_flight(rwco->blk); +- } +- + assert(qiov->size == acb->bytes); + rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes, + qiov, rwco->flags); +@@ -1436,12 +1435,6 @@ static void blk_aio_write_entry(void *opaque) + BlkRwCo *rwco = &acb->rwco; + QEMUIOVector *qiov = rwco->iobuf; + +- if (rwco->blk->quiesce_counter) { +- blk_dec_in_flight(rwco->blk); +- blk_wait_while_drained(rwco->blk); +- blk_inc_in_flight(rwco->blk); +- } +- + assert(!qiov || qiov->size == acb->bytes); + rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes, + qiov, 0, rwco->flags); +-- +1.8.3.1 + diff --git a/kvm-block-Fix-cross-AioContext-blockdev-snapshot.patch b/kvm-block-Fix-cross-AioContext-blockdev-snapshot.patch new file mode 100755 index 0000000..0bad890 --- /dev/null +++ b/kvm-block-Fix-cross-AioContext-blockdev-snapshot.patch @@ -0,0 +1,91 @@ +From 5774af5a3c713d0c93010c30453812eae6a749cd Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:37 +0000 +Subject: [PATCH 17/20] 
block: Fix cross-AioContext blockdev-snapshot + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-12-kwolf@redhat.com> +Patchwork-id: 94286 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 11/13] block: Fix cross-AioContext blockdev-snapshot +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +external_snapshot_prepare() tries to move the overlay to the AioContext +of the backing file (the snapshotted node). However, it's possible that +this doesn't work, but the backing file can instead be moved to the +overlay's AioContext (e.g. opening the backing chain for a mirror +target). + +bdrv_append() already indirectly uses bdrv_attach_node(), which takes +care to move nodes to make sure they use the same AioContext and which +tries both directions. + +So the problem has a simple fix: Just delete the unnecessary extra +bdrv_try_set_aio_context() call in external_snapshot_prepare() and +instead assert in bdrv_append() that both nodes were indeed moved to the +same AioContext. + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-6-kwolf@redhat.com> +Tested-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit 30dd65f307b647eef8156c4a33bd007823ef85cb) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block.c | 1 + + blockdev.c | 16 ---------------- + 2 files changed, 1 insertion(+), 16 deletions(-) + +diff --git a/block.c b/block.c +index 354d388..ec29b1e 100644 +--- a/block.c ++++ b/block.c +@@ -4327,6 +4327,7 @@ void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, + bdrv_ref(from); + + assert(qemu_get_current_aio_context() == qemu_get_aio_context()); ++ assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to)); + bdrv_drained_begin(from); + + /* Put all parents into @list and calculate their cumulative permissions */ +diff --git a/blockdev.c b/blockdev.c +index 7918533..c8d4b51 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1535,9 +1535,7 @@ static void external_snapshot_prepare(BlkActionState *common, + DO_UPCAST(ExternalSnapshotState, common, common); + TransactionAction *action = common->action; + AioContext *aio_context; +- AioContext *old_context; + uint64_t perm, shared; +- int ret; + + /* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar + * purpose but a different set of parameters */ +@@ -1678,20 +1676,6 @@ static void external_snapshot_prepare(BlkActionState *common, + goto out; + } + +- /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ +- old_context = bdrv_get_aio_context(state->new_bs); +- aio_context_release(aio_context); +- aio_context_acquire(old_context); +- +- ret = bdrv_try_set_aio_context(state->new_bs, aio_context, errp); +- +- aio_context_release(old_context); +- aio_context_acquire(aio_context); +- +- if (ret < 0) { +- goto out; +- } +- + /* This removes our old bs and adds the new bs. This is an operation that + * can fail, so we need to do it in .prepare; undoing it for abort is + * always possible. 
*/ +-- +1.8.3.1 + diff --git a/kvm-block-Fix-leak-in-bdrv_create_file_fallback.patch b/kvm-block-Fix-leak-in-bdrv_create_file_fallback.patch new file mode 100755 index 0000000..1735dc0 --- /dev/null +++ b/kvm-block-Fix-leak-in-bdrv_create_file_fallback.patch @@ -0,0 +1,60 @@ +From 05452efd7e0fb0522099ae09a396f8f97e66014a Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:47 +0000 +Subject: [PATCH 06/20] block: Fix leak in bdrv_create_file_fallback() + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-7-mlevitsk@redhat.com> +Patchwork-id: 94229 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 6/6] block: Fix leak in bdrv_create_file_fallback() +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +@options is leaked by the first two return statements in this function. + +Note that blk_new_open() takes the reference to @options even on +failure, so all we need to do to fix the leak is to move the QDict +allocation down to where we actually need it. + +Reported-by: Coverity (CID 1419884) +Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd + ("block: Generic file creation fallback") +Signed-off-by: Max Reitz +Message-Id: <20200225155618.133412-1-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit eeea1faa099f82328f5831cf252f8ce0a59a9287) +Signed-off-by: Maxim Levitsky + +Signed-off-by: Danilo C. L. 
de Paula +--- + block.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/block.c b/block.c +index 3beec7f..e1a4e38 100644 +--- a/block.c ++++ b/block.c +@@ -600,7 +600,7 @@ static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, + QemuOpts *opts, Error **errp) + { + BlockBackend *blk; +- QDict *options = qdict_new(); ++ QDict *options; + int64_t size = 0; + char *buf = NULL; + PreallocMode prealloc; +@@ -623,6 +623,7 @@ static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, + return -ENOTSUP; + } + ++ options = qdict_new(); + qdict_put_str(options, "driver", drv->format_name); + + blk = blk_new_open(filename, NULL, options, +-- +1.8.3.1 + diff --git a/kvm-block-Generic-file-creation-fallback.patch b/kvm-block-Generic-file-creation-fallback.patch new file mode 100755 index 0000000..a5dd1d7 --- /dev/null +++ b/kvm-block-Generic-file-creation-fallback.patch @@ -0,0 +1,227 @@ +From 882d09226b7f45b72c5b7763c4c4aba182e0f8a1 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:43 +0000 +Subject: [PATCH 02/20] block: Generic file creation fallback + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-3-mlevitsk@redhat.com> +Patchwork-id: 94227 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 2/6] block: Generic file creation fallback +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +If a protocol driver does not support image creation, we can see whether +maybe the file exists already. If so, just truncating it will be +sufficient. + +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-3-mreitz@redhat.com> +Signed-off-by: Max Reitz +(cherry picked from commit fd17146cd93d1704cd96d7c2757b325fc7aac6fd) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. 
de Paula +--- + block.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 147 insertions(+), 12 deletions(-) + +diff --git a/block.c b/block.c +index 2e5e8b6..3beec7f 100644 +--- a/block.c ++++ b/block.c +@@ -532,20 +532,139 @@ out: + return ret; + } + +-int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) ++/** ++ * Helper function for bdrv_create_file_fallback(): Resize @blk to at ++ * least the given @minimum_size. ++ * ++ * On success, return @blk's actual length. ++ * Otherwise, return -errno. ++ */ ++static int64_t create_file_fallback_truncate(BlockBackend *blk, ++ int64_t minimum_size, Error **errp) + { +- BlockDriver *drv; ++ Error *local_err = NULL; ++ int64_t size; ++ int ret; ++ ++ ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, &local_err); ++ if (ret < 0 && ret != -ENOTSUP) { ++ error_propagate(errp, local_err); ++ return ret; ++ } ++ ++ size = blk_getlength(blk); ++ if (size < 0) { ++ error_free(local_err); ++ error_setg_errno(errp, -size, ++ "Failed to inquire the new image file's length"); ++ return size; ++ } ++ ++ if (size < minimum_size) { ++ /* Need to grow the image, but we failed to do that */ ++ error_propagate(errp, local_err); ++ return -ENOTSUP; ++ } ++ ++ error_free(local_err); ++ local_err = NULL; ++ ++ return size; ++} ++ ++/** ++ * Helper function for bdrv_create_file_fallback(): Zero the first ++ * sector to remove any potentially pre-existing image header. 
++ */ ++static int create_file_fallback_zero_first_sector(BlockBackend *blk, ++ int64_t current_size, ++ Error **errp) ++{ ++ int64_t bytes_to_clear; ++ int ret; ++ ++ bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE); ++ if (bytes_to_clear) { ++ ret = blk_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, ++ "Failed to clear the new image's first sector"); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, ++ QemuOpts *opts, Error **errp) ++{ ++ BlockBackend *blk; ++ QDict *options = qdict_new(); ++ int64_t size = 0; ++ char *buf = NULL; ++ PreallocMode prealloc; + Error *local_err = NULL; + int ret; + ++ size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); ++ buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); ++ prealloc = qapi_enum_parse(&PreallocMode_lookup, buf, ++ PREALLOC_MODE_OFF, &local_err); ++ g_free(buf); ++ if (local_err) { ++ error_propagate(errp, local_err); ++ return -EINVAL; ++ } ++ ++ if (prealloc != PREALLOC_MODE_OFF) { ++ error_setg(errp, "Unsupported preallocation mode '%s'", ++ PreallocMode_str(prealloc)); ++ return -ENOTSUP; ++ } ++ ++ qdict_put_str(options, "driver", drv->format_name); ++ ++ blk = blk_new_open(filename, NULL, options, ++ BDRV_O_RDWR | BDRV_O_RESIZE, errp); ++ if (!blk) { ++ error_prepend(errp, "Protocol driver '%s' does not support image " ++ "creation, and opening the image failed: ", ++ drv->format_name); ++ return -EINVAL; ++ } ++ ++ size = create_file_fallback_truncate(blk, size, errp); ++ if (size < 0) { ++ ret = size; ++ goto out; ++ } ++ ++ ret = create_file_fallback_zero_first_sector(blk, size, errp); ++ if (ret < 0) { ++ goto out; ++ } ++ ++ ret = 0; ++out: ++ blk_unref(blk); ++ return ret; ++} ++ ++int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) ++{ ++ BlockDriver *drv; ++ + drv = bdrv_find_protocol(filename, true, errp); + if (drv == NULL) { + return 
-ENOENT; + } + +- ret = bdrv_create(drv, filename, opts, &local_err); +- error_propagate(errp, local_err); +- return ret; ++ if (drv->bdrv_co_create_opts) { ++ return bdrv_create(drv, filename, opts, errp); ++ } else { ++ return bdrv_create_file_fallback(filename, drv, opts, errp); ++ } + } + + /** +@@ -1422,6 +1541,24 @@ QemuOptsList bdrv_runtime_opts = { + }, + }; + ++static QemuOptsList fallback_create_opts = { ++ .name = "fallback-create-opts", ++ .head = QTAILQ_HEAD_INITIALIZER(fallback_create_opts.head), ++ .desc = { ++ { ++ .name = BLOCK_OPT_SIZE, ++ .type = QEMU_OPT_SIZE, ++ .help = "Virtual disk size" ++ }, ++ { ++ .name = BLOCK_OPT_PREALLOC, ++ .type = QEMU_OPT_STRING, ++ .help = "Preallocation mode (allowed values: off)" ++ }, ++ { /* end of list */ } ++ } ++}; ++ + /* + * Common part for opening disk images and files + * +@@ -5743,14 +5880,12 @@ void bdrv_img_create(const char *filename, const char *fmt, + return; + } + +- if (!proto_drv->create_opts) { +- error_setg(errp, "Protocol driver '%s' does not support image creation", +- proto_drv->format_name); +- return; +- } +- + create_opts = qemu_opts_append(create_opts, drv->create_opts); +- create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); ++ if (proto_drv->create_opts) { ++ create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); ++ } else { ++ create_opts = qemu_opts_append(create_opts, &fallback_create_opts); ++ } + + /* Create parameter list with default values */ + opts = qemu_opts_create(create_opts, NULL, 0, &error_abort); +-- +1.8.3.1 + diff --git a/kvm-block-Increase-BB.in_flight-for-coroutine-and-sync-i.patch b/kvm-block-Increase-BB.in_flight-for-coroutine-and-sync-i.patch new file mode 100755 index 0000000..463501a --- /dev/null +++ b/kvm-block-Increase-BB.in_flight-for-coroutine-and-sync-i.patch @@ -0,0 +1,295 @@ +From 52cc1d1cd2f695c5761d65baec961d14552a79ed Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:16 +0100 +Subject: [PATCH 5/6] 
block: Increase BB.in_flight for coroutine and sync + interfaces + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-6-kwolf@redhat.com> +Patchwork-id: 94600 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 5/6] block: Increase BB.in_flight for coroutine and sync interfaces +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +External callers of blk_co_*() and of the synchronous blk_*() functions +don't currently increase the BlockBackend.in_flight counter, but calls +from blk_aio_*() do, so there is an inconsistency whether the counter +has been increased or not. + +This patch moves the actual operations to static functions that can +later know they will always be called with in_flight increased exactly +once, even for external callers using the blk_co_*() coroutine +interfaces. + +If the public blk_co_*() interface is unused, remove it. + +Signed-off-by: Kevin Wolf +Message-Id: <20200407121259.21350-3-kwolf@redhat.com> +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit fbb92b6798894d3bf62fe3578d99fa62c720b242) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/block-backend.c | 103 ++++++++++++++++++++++++++++++++--------- + include/sysemu/block-backend.h | 1 - + 2 files changed, 80 insertions(+), 24 deletions(-) + +diff --git a/block/block-backend.c b/block/block-backend.c +index 17b2e87..610dbfa 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -1147,9 +1147,10 @@ static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) + } + } + +-int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, +- unsigned int bytes, QEMUIOVector *qiov, +- BdrvRequestFlags flags) ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn ++blk_do_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes, ++ QEMUIOVector *qiov, BdrvRequestFlags flags) + { + int ret; + BlockDriverState *bs; +@@ -1178,10 +1179,24 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, + return ret; + } + +-int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, +- unsigned int bytes, +- QEMUIOVector *qiov, size_t qiov_offset, +- BdrvRequestFlags flags) ++int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, ++ unsigned int bytes, QEMUIOVector *qiov, ++ BdrvRequestFlags flags) ++{ ++ int ret; ++ ++ blk_inc_in_flight(blk); ++ ret = blk_do_preadv(blk, offset, bytes, qiov, flags); ++ blk_dec_in_flight(blk); ++ ++ return ret; ++} ++ ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn ++blk_do_pwritev_part(BlockBackend *blk, int64_t offset, unsigned int bytes, ++ QEMUIOVector *qiov, size_t qiov_offset, ++ BdrvRequestFlags flags) + { + int ret; + BlockDriverState *bs; +@@ -1214,6 +1229,20 @@ int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, + return ret; + } + ++int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, ++ unsigned int bytes, ++ QEMUIOVector *qiov, size_t qiov_offset, ++ BdrvRequestFlags flags) ++{ ++ int ret; ++ ++ 
blk_inc_in_flight(blk); ++ ret = blk_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags); ++ blk_dec_in_flight(blk); ++ ++ return ret; ++} ++ + int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, + unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +@@ -1234,7 +1263,7 @@ static void blk_read_entry(void *opaque) + BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + +- rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size, ++ rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, qiov->size, + qiov, rwco->flags); + aio_wait_kick(); + } +@@ -1244,8 +1273,8 @@ static void blk_write_entry(void *opaque) + BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + +- rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size, +- qiov, rwco->flags); ++ rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, qiov->size, ++ qiov, 0, rwco->flags); + aio_wait_kick(); + } + +@@ -1262,6 +1291,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, + .ret = NOT_DONE, + }; + ++ blk_inc_in_flight(blk); + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + co_entry(&rwco); +@@ -1270,6 +1300,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, + bdrv_coroutine_enter(blk_bs(blk), co); + BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE); + } ++ blk_dec_in_flight(blk); + + return rwco.ret; + } +@@ -1394,7 +1425,7 @@ static void blk_aio_read_entry(void *opaque) + } + + assert(qiov->size == acb->bytes); +- rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes, ++ rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes, + qiov, rwco->flags); + blk_aio_complete(acb); + } +@@ -1412,8 +1443,8 @@ static void blk_aio_write_entry(void *opaque) + } + + assert(!qiov || qiov->size == acb->bytes); +- rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes, +- qiov, rwco->flags); ++ rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes, ++ 
qiov, 0, rwco->flags); + blk_aio_complete(acb); + } + +@@ -1498,7 +1529,9 @@ void blk_aio_cancel_async(BlockAIOCB *acb) + bdrv_aio_cancel_async(acb); + } + +-int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf) ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn ++blk_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf) + { + blk_wait_while_drained(blk); + +@@ -1514,8 +1547,7 @@ static void blk_ioctl_entry(void *opaque) + BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + +- rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, +- qiov->iov[0].iov_base); ++ rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, qiov->iov[0].iov_base); + aio_wait_kick(); + } + +@@ -1529,7 +1561,7 @@ static void blk_aio_ioctl_entry(void *opaque) + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + +- rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, rwco->iobuf); ++ rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf); + + blk_aio_complete(acb); + } +@@ -1540,7 +1572,9 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, + return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque); + } + +-int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn ++blk_do_pdiscard(BlockBackend *blk, int64_t offset, int bytes) + { + int ret; + +@@ -1559,7 +1593,7 @@ static void blk_aio_pdiscard_entry(void *opaque) + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + +- rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes); ++ rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, acb->bytes); + blk_aio_complete(acb); + } + +@@ -1571,12 +1605,23 @@ BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, + cb, opaque); + } + ++int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) ++{ ++ int ret; ++ ++ blk_inc_in_flight(blk); 
++ ret = blk_do_pdiscard(blk, offset, bytes); ++ blk_dec_in_flight(blk); ++ ++ return ret; ++} ++ + static void blk_pdiscard_entry(void *opaque) + { + BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + +- rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); ++ rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, qiov->size); + aio_wait_kick(); + } + +@@ -1585,7 +1630,8 @@ int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) + return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); + } + +-int blk_co_flush(BlockBackend *blk) ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn blk_do_flush(BlockBackend *blk) + { + blk_wait_while_drained(blk); + +@@ -1601,7 +1647,7 @@ static void blk_aio_flush_entry(void *opaque) + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + +- rwco->ret = blk_co_flush(rwco->blk); ++ rwco->ret = blk_do_flush(rwco->blk); + blk_aio_complete(acb); + } + +@@ -1611,10 +1657,21 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk, + return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque); + } + ++int coroutine_fn blk_co_flush(BlockBackend *blk) ++{ ++ int ret; ++ ++ blk_inc_in_flight(blk); ++ ret = blk_do_flush(blk); ++ blk_dec_in_flight(blk); ++ ++ return ret; ++} ++ + static void blk_flush_entry(void *opaque) + { + BlkRwCo *rwco = opaque; +- rwco->ret = blk_co_flush(rwco->blk); ++ rwco->ret = blk_do_flush(rwco->blk); + aio_wait_kick(); + } + +diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h +index b198dec..9bbdbd6 100644 +--- a/include/sysemu/block-backend.h ++++ b/include/sysemu/block-backend.h +@@ -171,7 +171,6 @@ BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int bytes, + BlockCompletionFunc *cb, void *opaque); + void blk_aio_cancel(BlockAIOCB *acb); + void blk_aio_cancel_async(BlockAIOCB *acb); +-int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf); + int 
blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf); + BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque); +-- +1.8.3.1 + diff --git a/kvm-block-Introduce-bdrv_reopen_commit_post-step.patch b/kvm-block-Introduce-bdrv_reopen_commit_post-step.patch new file mode 100755 index 0000000..72c8986 --- /dev/null +++ b/kvm-block-Introduce-bdrv_reopen_commit_post-step.patch @@ -0,0 +1,65 @@ +From f7dd953c2d0380cef3c351afb03d68c6fcda1dca Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:28 +0000 +Subject: [PATCH 08/20] block: Introduce 'bdrv_reopen_commit_post' step + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-3-kwolf@redhat.com> +Patchwork-id: 94278 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 02/13] block: Introduce 'bdrv_reopen_commit_post' step +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +From: Peter Krempa + +Add another step in the reopen process where driver can execute code +after permission changes are committed. + +Signed-off-by: Peter Krempa +Message-Id: +Signed-off-by: Kevin Wolf +(cherry picked from commit 17e1e2be5f9e84e0298e28e70675655b43e225ea) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block.c | 9 +++++++++ + include/block/block_int.h | 1 + + 2 files changed, 10 insertions(+) + +diff --git a/block.c b/block.c +index e1a4e38..a744bb5 100644 +--- a/block.c ++++ b/block.c +@@ -3657,6 +3657,15 @@ cleanup_perm: + } + } + } ++ ++ if (ret == 0) { ++ QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) { ++ BlockDriverState *bs = bs_entry->state.bs; ++ ++ if (bs->drv->bdrv_reopen_commit_post) ++ bs->drv->bdrv_reopen_commit_post(&bs_entry->state); ++ } ++ } + cleanup: + QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { + if (ret) { +diff --git a/include/block/block_int.h b/include/block/block_int.h +index dd033d0..c168690 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -123,6 +123,7 @@ struct BlockDriver { + int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp); + void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state); ++ void (*bdrv_reopen_commit_post)(BDRVReopenState *reopen_state); + void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state); + void (*bdrv_join_options)(QDict *options, QDict *old_options); + +-- +1.8.3.1 + diff --git a/kvm-block-Make-bdrv_get_cumulative_perm-public.patch b/kvm-block-Make-bdrv_get_cumulative_perm-public.patch new file mode 100755 index 0000000..2f0f999 --- /dev/null +++ b/kvm-block-Make-bdrv_get_cumulative_perm-public.patch @@ -0,0 +1,67 @@ +From 294ab4c4963295556d12ac15150b48c8536175a7 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:33 +0000 +Subject: [PATCH 13/20] block: Make bdrv_get_cumulative_perm() public + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-8-kwolf@redhat.com> +Patchwork-id: 94287 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 07/13] block: Make bdrv_get_cumulative_perm() public +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. 
Berrange +RH-Acked-by: Peter Krempa + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-2-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit c7a0f2be8f95b220cdadbba9a9236eaf115951dc) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 6 ++---- + include/block/block_int.h | 3 +++ + 2 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/block.c b/block.c +index 39e4647..354d388 100644 +--- a/block.c ++++ b/block.c +@@ -1850,8 +1850,6 @@ static int bdrv_child_check_perm(BdrvChild *c, BlockReopenQueue *q, + bool *tighten_restrictions, Error **errp); + static void bdrv_child_abort_perm_update(BdrvChild *c); + static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared); +-static void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, +- uint64_t *shared_perm); + + typedef struct BlockReopenQueueEntry { + bool prepared; +@@ -2075,8 +2073,8 @@ static void bdrv_set_perm(BlockDriverState *bs, uint64_t cumulative_perms, + } + } + +-static void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, +- uint64_t *shared_perm) ++void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, ++ uint64_t *shared_perm) + { + BdrvChild *c; + uint64_t cumulative_perms = 0; +diff --git a/include/block/block_int.h b/include/block/block_int.h +index c168690..96e327b 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -1228,6 +1228,9 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, + void *opaque, Error **errp); + void bdrv_root_unref_child(BdrvChild *child); + ++void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, ++ uint64_t *shared_perm); ++ + /** + * Sets a BdrvChild's permissions. 
Avoid if the parent is a BDS; use + * bdrv_child_refresh_perms() instead and make the parent's +-- +1.8.3.1 + diff --git a/kvm-block-Make-it-easier-to-learn-which-BDS-support-bitm.patch b/kvm-block-Make-it-easier-to-learn-which-BDS-support-bitm.patch new file mode 100755 index 0000000..0d4a000 --- /dev/null +++ b/kvm-block-Make-it-easier-to-learn-which-BDS-support-bitm.patch @@ -0,0 +1,145 @@ +From 41d6c207c482093df8669f7cdcdb49bb25dba741 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:12 +0100 +Subject: [PATCH 07/26] block: Make it easier to learn which BDS support + bitmaps +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-5-eblake@redhat.com> +Patchwork-id: 97071 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 04/12] block: Make it easier to learn which BDS support bitmaps +Bugzilla: 1779893 1779904 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Upcoming patches will enhance bitmap support in qemu-img, but in doing +so, it turns out to be nice to suppress output when persistent bitmaps +make no sense (such as on a qcow2 v2 image). Add a hook to make this +easier to query. + +This patch adds a new callback .bdrv_supports_persistent_dirty_bitmap, +rather than trying to shoehorn the answer in via existing callbacks. +In particular, while it might have been possible to overload +.bdrv_co_can_store_new_dirty_bitmap to special-case a NULL input to +answer whether any persistent bitmaps are supported, that is at odds +with whether a particular bitmap can be stored (for example, even on +an image that supports persistent bitmaps but has currently filled up +the maximum number of bitmaps, attempts to store another one should +fail); and the new functionality doesn't require coroutine safety. 
+Similarly, we could have added one more piece of information to +.bdrv_get_info, but then again, most callers to that function tend to +already discard extraneous information, and making it a catch-all +rather than a series of dedicated scalar queries hasn't really +simplified life. + +In the future, when we improve the ability to look up bitmaps through +a filter, we will probably also want to teach the block layer to +automatically let filters pass this request on through. + +Signed-off-by: Eric Blake +Message-Id: <20200513011648.166876-4-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit ef893b5c84f3199d777e33966dc28839f71b1a5c) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + block/dirty-bitmap.c | 9 +++++++++ + block/qcow2-bitmap.c | 7 +++++++ + block/qcow2.c | 2 ++ + block/qcow2.h | 1 + + include/block/block_int.h | 1 + + include/block/dirty-bitmap.h | 1 + + 6 files changed, 21 insertions(+) + +diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c +index 7039e82..2f96acc 100644 +--- a/block/dirty-bitmap.c ++++ b/block/dirty-bitmap.c +@@ -478,6 +478,15 @@ int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name, + } + } + ++bool ++bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs) ++{ ++ if (bs->drv && bs->drv->bdrv_supports_persistent_dirty_bitmap) { ++ return bs->drv->bdrv_supports_persistent_dirty_bitmap(bs); ++ } ++ return false; ++} ++ + static bool coroutine_fn + bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, + uint32_t granularity, Error **errp) +diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c +index c6c8ebb..cbac905 100644 +--- a/block/qcow2-bitmap.c ++++ b/block/qcow2-bitmap.c +@@ -1759,3 +1759,10 @@ fail: + name, bdrv_get_device_or_node_name(bs)); + return false; + } ++ ++bool qcow2_supports_persistent_dirty_bitmap(BlockDriverState *bs) ++{ ++ BDRVQcow2State *s = bs->opaque; ++ ++ return s->qcow_version >= 3; ++} +diff 
--git a/block/qcow2.c b/block/qcow2.c +index af0ad4a..36b0f7d 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -5551,6 +5551,8 @@ BlockDriver bdrv_qcow2 = { + .bdrv_detach_aio_context = qcow2_detach_aio_context, + .bdrv_attach_aio_context = qcow2_attach_aio_context, + ++ .bdrv_supports_persistent_dirty_bitmap = ++ qcow2_supports_persistent_dirty_bitmap, + .bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap, + .bdrv_co_remove_persistent_dirty_bitmap = + qcow2_co_remove_persistent_dirty_bitmap, +diff --git a/block/qcow2.h b/block/qcow2.h +index 0942126..ceb1ceb 100644 +--- a/block/qcow2.h ++++ b/block/qcow2.h +@@ -767,6 +767,7 @@ bool qcow2_co_can_store_new_dirty_bitmap(BlockDriverState *bs, + int qcow2_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, + const char *name, + Error **errp); ++bool qcow2_supports_persistent_dirty_bitmap(BlockDriverState *bs); + + ssize_t coroutine_fn + qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 562dca1..cc18e8d 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -568,6 +568,7 @@ struct BlockDriver { + uint64_t parent_perm, uint64_t parent_shared, + uint64_t *nperm, uint64_t *nshared); + ++ bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs); + bool (*bdrv_co_can_store_new_dirty_bitmap)(BlockDriverState *bs, + const char *name, + uint32_t granularity, +diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h +index e2b20ec..f6e9a38 100644 +--- a/include/block/dirty-bitmap.h ++++ b/include/block/dirty-bitmap.h +@@ -16,6 +16,7 @@ typedef enum BitmapCheckFlags { + + #define BDRV_BITMAP_MAX_NAME_SIZE 1023 + ++bool bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs); + BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, +-- +1.8.3.1 + diff --git 
a/kvm-block-Relax-restrictions-for-blockdev-snapshot.patch b/kvm-block-Relax-restrictions-for-blockdev-snapshot.patch new file mode 100755 index 0000000..de85205 --- /dev/null +++ b/kvm-block-Relax-restrictions-for-blockdev-snapshot.patch @@ -0,0 +1,117 @@ +From 9ba321e18a357c1a3a238ceee301bbb174f96eee Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:34 +0000 +Subject: [PATCH 14/20] block: Relax restrictions for blockdev-snapshot + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-9-kwolf@redhat.com> +Patchwork-id: 94285 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 08/13] block: Relax restrictions for blockdev-snapshot +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +blockdev-snapshot returned an error if the overlay was already in use, +which it defined as having any BlockBackend parent. This is in fact both +too strict (some parents can tolerate the change of visible data caused +by attaching a backing file) and too loose (some non-BlockBackend +parents may not be happy with it). + +One important use case that is prevented by the too strict check is live +storage migration with blockdev-mirror. Here, the target node is +usually opened without a backing file so that the active layer is +mirrored while its backing chain can be copied in the background. + +The backing chain should be attached to the mirror target node when +finalising the job, just before switching the users of the source node +to the new copy (at which point the mirror job still has a reference to +the node). drive-mirror did this automatically, but with blockdev-mirror +this is the job of the QMP client, so it needs a way to do this. + +blockdev-snapshot is the obvious way, so this patch makes it work in +this scenario. The new condition is that no parent uses CONSISTENT_READ +permissions. 
This will ensure that the operation will still be blocked +when the node is attached to the guest device, so blockdev-snapshot +remains safe. + +(For the sake of completeness, x-blockdev-reopen can be used to achieve +the same, however it is a big hammer, performs the graph change +completely unchecked and is still experimental. So even with the option +of using x-blockdev-reopen, there are reasons why blockdev-snapshot +should be able to perform this operation.) + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-3-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Tested-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit d29d3d1f80b3947fb26e7139645c83de66d146a9) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 14 ++++++++------ + tests/qemu-iotests/085.out | 4 ++-- + 2 files changed, 10 insertions(+), 8 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 4cd9a58..7918533 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1536,6 +1536,7 @@ static void external_snapshot_prepare(BlkActionState *common, + TransactionAction *action = common->action; + AioContext *aio_context; + AioContext *old_context; ++ uint64_t perm, shared; + int ret; + + /* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar +@@ -1656,16 +1657,17 @@ static void external_snapshot_prepare(BlkActionState *common, + goto out; + } + +- if (bdrv_has_blk(state->new_bs)) { ++ /* ++ * Allow attaching a backing file to an overlay that's already in use only ++ * if the parents don't assume that they are already seeing a valid image. ++ * (Specifically, allow it as a mirror target, which is write-only access.) 
++ */ ++ bdrv_get_cumulative_perm(state->new_bs, &perm, &shared); ++ if (perm & BLK_PERM_CONSISTENT_READ) { + error_setg(errp, "The overlay is already in use"); + goto out; + } + +- if (bdrv_op_is_blocked(state->new_bs, BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT, +- errp)) { +- goto out; +- } +- + if (state->new_bs->backing != NULL) { + error_setg(errp, "The overlay already has a backing image"); + goto out; +diff --git a/tests/qemu-iotests/085.out b/tests/qemu-iotests/085.out +index bb50227..487d920 100644 +--- a/tests/qemu-iotests/085.out ++++ b/tests/qemu-iotests/085.out +@@ -82,7 +82,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/ + === Invalid command - cannot create a snapshot using a file BDS === + + { 'execute': 'blockdev-snapshot', 'arguments': { 'node':'virtio0', 'overlay':'file_12' } } +-{"error": {"class": "GenericError", "desc": "The overlay does not support backing images"}} ++{"error": {"class": "GenericError", "desc": "The overlay is already in use"}} + + === Invalid command - snapshot node used as active layer === + +@@ -96,7 +96,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/ + === Invalid command - snapshot node used as backing hd === + + { 'execute': 'blockdev-snapshot', 'arguments': { 'node': 'virtio0', 'overlay':'snap_11' } } +-{"error": {"class": "GenericError", "desc": "Node 'snap_11' is busy: node is used as backing hd of 'snap_12'"}} ++{"error": {"class": "GenericError", "desc": "The overlay is already in use"}} + + === Invalid command - snapshot node has a backing image === + +-- +1.8.3.1 + diff --git a/kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch b/kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch new file mode 100755 index 0000000..1188911 --- /dev/null +++ b/kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch @@ -0,0 +1,77 @@ +From e191ab6358b656764374ff1b3c7224a744dc902a Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Tue, 26 
Jan 2021 17:21:02 -0500 +Subject: [PATCH 7/9] block: Require aligned image size to avoid assertion + failure + +RH-Author: Kevin Wolf +Message-id: <20210126172103.136060-2-kwolf@redhat.com> +Patchwork-id: 100786 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/2] block: Require aligned image size to avoid assertion failure +Bugzilla: 1834281 +RH-Acked-by: Markus Armbruster +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +Unaligned requests will automatically be aligned to bl.request_alignment +and we can't extend write requests to access space beyond the end of the +image without resizing the image, so if we have the WRITE permission, +but not the RESIZE one, it's required that the image size is aligned. + +Failing to meet this requirement could cause assertion failures like +this if RESIZE permissions weren't requested: + +qemu-img: block/io.c:1910: bdrv_co_write_req_prepare: Assertion `end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE' failed. + +This was e.g. triggered by qemu-img converting to a target image with 4k +request alignment when the image was only aligned to 512 bytes, but not +to 4k. + +Turn this into a graceful error in bdrv_check_perm() so that WRITE +without RESIZE can only be taken if the image size is aligned. If a user +holds both permissions and drops only RESIZE, the function will return +an error, but bdrv_child_try_set_perm() will ignore the failure silently +if permissions are only requested to be relaxed and just keep both +permissions while returning success. 
+ +Signed-off-by: Kevin Wolf +Message-Id: <20200716142601.111237-2-kwolf@redhat.com> +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 9c60a5d1978e6dcf85c0e01b50e6f7f54ca09104) +Signed-off-by: Kevin Wolf +Signed-off-by: Jon Maloy +--- + block.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/block.c b/block.c +index 57740d312e..e9579ddf84 100644 +--- a/block.c ++++ b/block.c +@@ -2009,6 +2009,22 @@ static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q, + return -EPERM; + } + ++ /* ++ * Unaligned requests will automatically be aligned to bl.request_alignment ++ * and without RESIZE we can't extend requests to write to space beyond the ++ * end of the image, so it's required that the image size is aligned. ++ */ ++ if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) && ++ !(cumulative_perms & BLK_PERM_RESIZE)) ++ { ++ if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) { ++ error_setg(errp, "Cannot get 'write' permission without 'resize': " ++ "Image size is not a multiple of request " ++ "alignment"); ++ return -EPERM; ++ } ++ } ++ + /* Check this node */ + if (!drv) { + return 0; +-- +2.18.2 + diff --git a/kvm-block-Versioned-x-blockdev-reopen-API-with-feature-f.patch b/kvm-block-Versioned-x-blockdev-reopen-API-with-feature-f.patch new file mode 100755 index 0000000..ea796d5 --- /dev/null +++ b/kvm-block-Versioned-x-blockdev-reopen-API-with-feature-f.patch @@ -0,0 +1,57 @@ +From 371d312300251c0dc24522607b06b7e47e760b53 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:32 +0000 +Subject: [PATCH 12/20] block: Versioned x-blockdev-reopen API with feature + flag + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-7-kwolf@redhat.com> +Patchwork-id: 94283 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 06/13] block: Versioned x-blockdev-reopen API with feature flag +Bugzilla: 1790482 1805143 +RH-Acked-by: Eric Blake +RH-Acked-by: John Snow 
+RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +x-blockdev-reopen is still considered unstable upstream. libvirt needs +(a small subset of) it for incremental backups, though. + +Add a downstream-only feature flag that effectively makes this a +versioned interface. As long as the feature is present, we promise that +we won't change the interface incompatibly. Incompatible changes to the +command will require us to drop the feature flag (and possibly introduce +a new one if the new version is still not stable upstream). + +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + qapi/block-core.json | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/qapi/block-core.json b/qapi/block-core.json +index 0cf68fe..a1e85b0 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -4202,10 +4202,17 @@ + # image does not have a default backing file name as part of its + # metadata. + # ++# Features: ++# @__com.redhat_rhel-av-8_2_0-api: Versioning the downstream interface while ++# it's still unstable upstream. As long as ++# this flag is present, this command will not ++# change incompatibly. ++# + # Since: 4.0 + ## + { 'command': 'x-blockdev-reopen', +- 'data': 'BlockdevOptions', 'boxed': true } ++ 'data': 'BlockdevOptions', 'boxed': true, ++ 'features': [ '__com.redhat_rhel-av-8_2_0-api' ] } + + ## + # @blockdev-del: +-- +1.8.3.1 + diff --git a/kvm-block-always-fill-entire-LUKS-header-space-with-zero.patch b/kvm-block-always-fill-entire-LUKS-header-space-with-zero.patch new file mode 100755 index 0000000..d1511d2 --- /dev/null +++ b/kvm-block-always-fill-entire-LUKS-header-space-with-zero.patch @@ -0,0 +1,308 @@ +From 67f36d057aa71ca56ebc17ef28a7cb70bac6c6b6 Mon Sep 17 00:00:00 2001 +From: "Daniel P. 
Berrange" +Date: Tue, 5 May 2020 16:46:01 +0100 +Subject: [PATCH 01/12] block: always fill entire LUKS header space with zeros +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Daniel P. Berrange +Message-id: <20200505164601.1059974-2-berrange@redhat.com> +Patchwork-id: 96277 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] block: always fill entire LUKS header space with zeros +Bugzilla: 1775462 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: John Snow +RH-Acked-by: Stefan Hajnoczi + +When initializing the LUKS header the size with default encryption +parameters will currently be 2068480 bytes. This is rounded up to +a multiple of the cluster size, 2081792, with 64k sectors. If the +end of the header is not the same as the end of the cluster we fill +the extra space with zeros. This was forgetting that not even the +space allocated for the header will be fully initialized, as we +only write key material for the first key slot. The space left +for the other 7 slots is never written to. + +An optimization to the ref count checking code: + + commit a5fff8d4b4d928311a5005efa12d0991fe3b66f9 (refs/bisect/bad) + Author: Vladimir Sementsov-Ogievskiy + Date: Wed Feb 27 16:14:30 2019 +0300 + + qcow2-refcount: avoid eating RAM + +made the assumption that every cluster which was allocated would +have at least some data written to it. This was violated by way +the LUKS header is only partially written, with much space simply +reserved for future use. + +Depending on the cluster size this problem was masked by the +logic which wrote zeros between the end of the LUKS header and +the end of the cluster. 
+ +$ qemu-img create --object secret,id=cluster_encrypt0,data=123456 \ + -f qcow2 -o cluster_size=2k,encrypt.iter-time=1,\ + encrypt.format=luks,encrypt.key-secret=cluster_encrypt0 \ + cluster_size_check.qcow2 100M + Formatting 'cluster_size_check.qcow2', fmt=qcow2 size=104857600 + encrypt.format=luks encrypt.key-secret=cluster_encrypt0 + encrypt.iter-time=1 cluster_size=2048 lazy_refcounts=off refcount_bits=16 + +$ qemu-img check --object secret,id=cluster_encrypt0,data=redhat \ + 'json:{"driver": "qcow2", "encrypt.format": "luks", \ + "encrypt.key-secret": "cluster_encrypt0", \ + "file.driver": "file", "file.filename": "cluster_size_check.qcow2"}' +ERROR: counting reference for region exceeding the end of the file by one cluster or more: offset 0x2000 size 0x1f9000 +Leaked cluster 4 refcount=1 reference=0 +...snip... +Leaked cluster 130 refcount=1 reference=0 + +1 errors were found on the image. +Data may be corrupted, or further writes to the image may corrupt it. + +127 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. +Image end offset: 268288 + +The problem only exists when the disk image is entirely empty. Writing +data to the disk image payload will solve the problem by causing the +end of the file to be extended further. + +The change fixes it by ensuring that the entire allocated LUKS header +region is fully initialized with zeros. The qemu-img check will still +fail for any pre-existing disk images created prior to this change, +unless at least 1 byte of the payload is written to. + +Fully writing zeros to the entire LUKS header is a good idea regardless +as it ensures that space has been allocated on the host filesystem (or +whatever block storage backend is used). + +Signed-off-by: Daniel P. Berrangé +Message-Id: <20200207135520.2669430-1-berrange@redhat.com> +Reviewed-by: Eric Blake +Signed-off-by: Max Reitz +(cherry picked from commit 087ab8e775f48766068e65de1bc99d03b40d1670) +Signed-off-by: Danilo C. L. 
de Paula + +Conflicts: + tests/qemu-iotests/group: no test 283 in downstream + +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2.c | 11 ++++-- + tests/qemu-iotests/284 | 97 ++++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/284.out | 62 +++++++++++++++++++++++++++++ + tests/qemu-iotests/group | 1 + + 4 files changed, 167 insertions(+), 4 deletions(-) + create mode 100755 tests/qemu-iotests/284 + create mode 100644 tests/qemu-iotests/284.out + +diff --git a/block/qcow2.c b/block/qcow2.c +index 71067c6..af0ad4a 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -135,13 +135,16 @@ static ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen, + s->crypto_header.length = headerlen; + s->crypto_header.offset = ret; + +- /* Zero fill remaining space in cluster so it has predictable +- * content in case of future spec changes */ ++ /* ++ * Zero fill all space in cluster so it has predictable ++ * content, as we may not initialize some regions of the ++ * header (eg only 1 out of 8 key slots will be initialized) ++ */ + clusterlen = size_to_clusters(s, headerlen) * s->cluster_size; + assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen, false) == 0); + ret = bdrv_pwrite_zeroes(bs->file, +- ret + headerlen, +- clusterlen - headerlen, 0); ++ ret, ++ clusterlen, 0); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not zero fill encryption header"); + return -1; +diff --git a/tests/qemu-iotests/284 b/tests/qemu-iotests/284 +new file mode 100755 +index 0000000..071e89b +--- /dev/null ++++ b/tests/qemu-iotests/284 +@@ -0,0 +1,97 @@ ++#!/usr/bin/env bash ++# ++# Test ref count checks on encrypted images ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. 
++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++# creator ++owner=berrange@redhat.com ++ ++seq=`basename $0` ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! ++ ++_cleanup() ++{ ++ _cleanup_test_img ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++. ./common.rc ++. ./common.filter ++ ++_supported_fmt qcow2 ++_supported_proto generic ++_supported_os Linux ++ ++ ++size=1M ++ ++SECRET="secret,id=sec0,data=astrochicken" ++ ++IMGSPEC="driver=$IMGFMT,file.filename=$TEST_IMG,encrypt.key-secret=sec0" ++QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT ++ ++_run_test() ++{ ++ IMGOPTSSYNTAX=true ++ OLD_TEST_IMG="$TEST_IMG" ++ TEST_IMG="driver=$IMGFMT,file.filename=$TEST_IMG,encrypt.key-secret=sec0" ++ QEMU_IMG_EXTRA_ARGS="--image-opts --object $SECRET" ++ ++ echo ++ echo "== cluster size $csize" ++ echo "== checking image refcounts ==" ++ _check_test_img ++ ++ echo ++ echo "== writing some data ==" ++ $QEMU_IO -c "write -P 0x9 0 1" $QEMU_IMG_EXTRA_ARGS $TEST_IMG | _filter_qemu_io | _filter_testdir ++ echo ++ echo "== rechecking image refcounts ==" ++ _check_test_img ++ ++ echo ++ echo "== writing some more data ==" ++ $QEMU_IO -c "write -P 0x9 $csize 1" $QEMU_IMG_EXTRA_ARGS $TEST_IMG | _filter_qemu_io | _filter_testdir ++ echo ++ echo "== rechecking image refcounts ==" ++ _check_test_img ++ ++ TEST_IMG="$OLD_TEST_IMG" ++ QEMU_IMG_EXTRA_ARGS= ++ IMGOPTSSYNTAX= ++} ++ ++ ++echo ++echo "testing LUKS qcow2 encryption" ++echo ++ ++for csize in 512 2048 32768 ++do ++ _make_test_img --object $SECRET -o "encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10,cluster_size=$csize" 
$size ++ _run_test ++ _cleanup_test_img ++done ++ ++# success, all done ++echo "*** done" ++rm -f $seq.full ++status=0 +diff --git a/tests/qemu-iotests/284.out b/tests/qemu-iotests/284.out +new file mode 100644 +index 0000000..48216f5 +--- /dev/null ++++ b/tests/qemu-iotests/284.out +@@ -0,0 +1,62 @@ ++QA output created by 284 ++ ++testing LUKS qcow2 encryption ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10 ++ ++== cluster size 512 ++== checking image refcounts == ++No errors were found on the image. ++ ++== writing some data == ++wrote 1/1 bytes at offset 0 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++ ++== writing some more data == ++wrote 1/1 bytes at offset 512 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10 ++ ++== cluster size 2048 ++== checking image refcounts == ++No errors were found on the image. ++ ++== writing some data == ++wrote 1/1 bytes at offset 0 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++ ++== writing some more data == ++wrote 1/1 bytes at offset 2048 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10 ++ ++== cluster size 32768 ++== checking image refcounts == ++No errors were found on the image. 
++ ++== writing some data == ++wrote 1/1 bytes at offset 0 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++ ++== writing some more data == ++wrote 1/1 bytes at offset 32768 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++*** done +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index e47cbfc..9c565cf 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -289,3 +289,4 @@ + 277 rw quick + 280 rw migration quick + 281 rw quick ++284 rw +-- +1.8.3.1 + diff --git a/kvm-block-backend-Add-flags-to-blk_truncate.patch b/kvm-block-backend-Add-flags-to-blk_truncate.patch new file mode 100755 index 0000000..5b212fc --- /dev/null +++ b/kvm-block-backend-Add-flags-to-blk_truncate.patch @@ -0,0 +1,294 @@ +From 07a93e74efa4861f54dd3d4bec01885f7af2fee3 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 17:01:32 +0200 +Subject: [PATCH 04/17] block-backend: Add flags to blk_truncate() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-4-kwolf@redhat.com> +Patchwork-id: 97450 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 03/11] block-backend: Add flags to blk_truncate() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +Now that node level interface bdrv_truncate() supports passing request +flags to the block driver, expose this on the BlockBackend level, too. + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Max Reitz +Message-Id: <20200424125448.63318-4-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 8c6242b6f383e43fd11d2c50f8bcdd2bba1100fc) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block.c | 3 ++- + block/block-backend.c | 4 ++-- + block/commit.c | 4 ++-- + block/crypto.c | 2 +- + block/mirror.c | 2 +- + block/qcow2.c | 4 ++-- + block/qed.c | 2 +- + block/vdi.c | 2 +- + block/vhdx.c | 4 ++-- + block/vmdk.c | 6 +++--- + block/vpc.c | 2 +- + blockdev.c | 2 +- + include/sysemu/block-backend.h | 2 +- + qemu-img.c | 2 +- + qemu-io-cmds.c | 2 +- + 15 files changed, 22 insertions(+), 21 deletions(-) + +diff --git a/block.c b/block.c +index d6a05da..12c8941 100644 +--- a/block.c ++++ b/block.c +@@ -547,7 +547,8 @@ static int64_t create_file_fallback_truncate(BlockBackend *blk, + int64_t size; + int ret; + +- ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, &local_err); ++ ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0, ++ &local_err); + if (ret < 0 && ret != -ENOTSUP) { + error_propagate(errp, local_err); + return ret; +diff --git a/block/block-backend.c b/block/block-backend.c +index 8be2006..17ed6d8 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -2137,14 +2137,14 @@ int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf, + } + + int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp) + { + if (!blk_is_available(blk)) { + error_setg(errp, "No medium inserted"); + return -ENOMEDIUM; + } + +- return bdrv_truncate(blk->root, offset, exact, prealloc, 0, errp); ++ return bdrv_truncate(blk->root, offset, exact, prealloc, flags, errp); + } + + int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, +diff --git a/block/commit.c b/block/commit.c +index 23c90b3..075ebf8 100644 +--- a/block/commit.c ++++ b/block/commit.c +@@ -155,7 +155,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp) + } + + if (base_len < len) { +- ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, NULL); ++ ret = blk_truncate(s->base, len, false, 
PREALLOC_MODE_OFF, 0, NULL); + if (ret) { + goto out; + } +@@ -471,7 +471,7 @@ int bdrv_commit(BlockDriverState *bs) + * grow the backing file image if possible. If not possible, + * we must return an error */ + if (length > backing_length) { +- ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, ++ ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, 0, + &local_err); + if (ret < 0) { + error_report_err(local_err); +diff --git a/block/crypto.c b/block/crypto.c +index fcb4a97..83a8fc0 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -115,7 +115,7 @@ static ssize_t block_crypto_init_func(QCryptoBlock *block, + * which will be used by the crypto header + */ + return blk_truncate(data->blk, data->size + headerlen, false, +- data->prealloc, errp); ++ data->prealloc, 0, errp); + } + + +diff --git a/block/mirror.c b/block/mirror.c +index 0d32fca..c8028cd 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -886,7 +886,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + if (s->base == blk_bs(s->target)) { + if (s->bdev_length > target_length) { + ret = blk_truncate(s->target, s->bdev_length, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + goto immediate_exit; + } +diff --git a/block/qcow2.c b/block/qcow2.c +index c0fdcb9..86aa74a 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -3497,7 +3497,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) + + /* Okay, now that we have a valid image, let's give it the right size */ + ret = blk_truncate(blk, qcow2_opts->size, false, qcow2_opts->preallocation, +- errp); ++ 0, errp); + if (ret < 0) { + error_prepend(errp, "Could not resize image: "); + goto out; +@@ -5347,7 +5347,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, + * Amending image options should ensure that the image has + * exactly the given new values, so pass exact=true here. 
+ */ +- ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, 0, errp); + blk_unref(blk); + if (ret < 0) { + return ret; +diff --git a/block/qed.c b/block/qed.c +index fb6100b..b0fdb8f 100644 +--- a/block/qed.c ++++ b/block/qed.c +@@ -677,7 +677,7 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts, + * The QED format associates file length with allocation status, + * so a new file (which is empty) must have a length of 0. + */ +- ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + goto out; + } +diff --git a/block/vdi.c b/block/vdi.c +index e1a11f2..0c7835a 100644 +--- a/block/vdi.c ++++ b/block/vdi.c +@@ -875,7 +875,7 @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options, + + if (image_type == VDI_TYPE_STATIC) { + ret = blk_truncate(blk, offset + blocks * block_size, false, +- PREALLOC_MODE_OFF, errp); ++ PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + error_prepend(errp, "Failed to statically allocate file"); + goto exit; +diff --git a/block/vhdx.c b/block/vhdx.c +index 5dfbb20..21497f7 100644 +--- a/block/vhdx.c ++++ b/block/vhdx.c +@@ -1703,13 +1703,13 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s, + /* All zeroes, so we can just extend the file - the end of the BAT + * is the furthest thing we have written yet */ + ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF, +- errp); ++ 0, errp); + if (ret < 0) { + goto exit; + } + } else if (type == VHDX_TYPE_FIXED) { + ret = blk_truncate(blk, data_file_offset + image_size, false, +- PREALLOC_MODE_OFF, errp); ++ PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + goto exit; + } +diff --git a/block/vmdk.c b/block/vmdk.c +index 1bbf937..1bd3991 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -2118,7 +2118,7 @@ static int vmdk_init_extent(BlockBackend *blk, + int 
gd_buf_size; + + if (flat) { +- ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp); + goto exit; + } + magic = cpu_to_be32(VMDK4_MAGIC); +@@ -2182,7 +2182,7 @@ static int vmdk_init_extent(BlockBackend *blk, + } + + ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false, +- PREALLOC_MODE_OFF, errp); ++ PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + goto exit; + } +@@ -2523,7 +2523,7 @@ static int coroutine_fn vmdk_co_do_create(int64_t size, + /* bdrv_pwrite write padding zeros to align to sector, we don't need that + * for description file */ + if (desc_offset == 0) { +- ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + goto exit; + } +diff --git a/block/vpc.c b/block/vpc.c +index 6df75e2..d5e7dc8 100644 +--- a/block/vpc.c ++++ b/block/vpc.c +@@ -898,7 +898,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, + /* Add footer to total size */ + total_size += HEADER_SIZE; + +- ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + return ret; + } +diff --git a/blockdev.c b/blockdev.c +index 5128c9b..6dde52a 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -3055,7 +3055,7 @@ void qmp_block_resize(bool has_device, const char *device, + } + + bdrv_drained_begin(bs); +- ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp); + bdrv_drained_end(bs); + + out: +diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h +index 9bbdbd6..34de7fa 100644 +--- a/include/sysemu/block-backend.h ++++ b/include/sysemu/block-backend.h +@@ -237,7 +237,7 @@ int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int blk_pwrite_compressed(BlockBackend 
*blk, int64_t offset, const void *buf, + int bytes); + int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp); ++ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); + int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes); + int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, + int64_t pos, int size); +diff --git a/qemu-img.c b/qemu-img.c +index 6dc881b..a27ad70 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -3939,7 +3939,7 @@ static int img_resize(int argc, char **argv) + * resizing, so pass @exact=true. It is of no use to report + * success when the image has not actually been resized. + */ +- ret = blk_truncate(blk, total_size, true, prealloc, &err); ++ ret = blk_truncate(blk, total_size, true, prealloc, 0, &err); + if (!ret) { + qprintf(quiet, "Image resized.\n"); + } else { +diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c +index 1b7e700..851f07e 100644 +--- a/qemu-io-cmds.c ++++ b/qemu-io-cmds.c +@@ -1715,7 +1715,7 @@ static int truncate_f(BlockBackend *blk, int argc, char **argv) + * exact=true. It is better to err on the "emit more errors" side + * than to be overly permissive. 
+ */ +- ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, &local_err); ++ ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, 0, &local_err); + if (ret < 0) { + error_report_err(local_err); + return ret; +-- +1.8.3.1 + diff --git a/kvm-block-backend-Reorder-flush-pdiscard-function-defini.patch b/kvm-block-backend-Reorder-flush-pdiscard-function-defini.patch new file mode 100755 index 0000000..9d49cfa --- /dev/null +++ b/kvm-block-backend-Reorder-flush-pdiscard-function-defini.patch @@ -0,0 +1,158 @@ +From 6cc456c4c1e6557fdc7e138e8ef8171b71609222 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:15 +0100 +Subject: [PATCH 4/6] block-backend: Reorder flush/pdiscard function + definitions + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-5-kwolf@redhat.com> +Patchwork-id: 94598 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/6] block-backend: Reorder flush/pdiscard function definitions +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +Move all variants of the flush/pdiscard functions to a single place and +put the blk_co_*() version first because it is called by all other +variants (and will become static in the next patch). + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Max Reitz +Message-Id: <20200407121259.21350-2-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 564806c529d4e0acad209b1e5b864a8886092f1f) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/block-backend.c | 92 +++++++++++++++++++++++++-------------------------- + 1 file changed, 46 insertions(+), 46 deletions(-) + +diff --git a/block/block-backend.c b/block/block-backend.c +index 8b8f2a8..17b2e87 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -1488,38 +1488,6 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, + blk_aio_write_entry, flags, cb, opaque); + } + +-static void blk_aio_flush_entry(void *opaque) +-{ +- BlkAioEmAIOCB *acb = opaque; +- BlkRwCo *rwco = &acb->rwco; +- +- rwco->ret = blk_co_flush(rwco->blk); +- blk_aio_complete(acb); +-} +- +-BlockAIOCB *blk_aio_flush(BlockBackend *blk, +- BlockCompletionFunc *cb, void *opaque) +-{ +- return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque); +-} +- +-static void blk_aio_pdiscard_entry(void *opaque) +-{ +- BlkAioEmAIOCB *acb = opaque; +- BlkRwCo *rwco = &acb->rwco; +- +- rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes); +- blk_aio_complete(acb); +-} +- +-BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, +- int64_t offset, int bytes, +- BlockCompletionFunc *cb, void *opaque) +-{ +- return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0, +- cb, opaque); +-} +- + void blk_aio_cancel(BlockAIOCB *acb) + { + bdrv_aio_cancel(acb); +@@ -1586,6 +1554,37 @@ int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) + return bdrv_co_pdiscard(blk->root, offset, bytes); + } + ++static void blk_aio_pdiscard_entry(void *opaque) ++{ ++ BlkAioEmAIOCB *acb = opaque; ++ BlkRwCo *rwco = &acb->rwco; ++ ++ rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes); ++ blk_aio_complete(acb); ++} ++ ++BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, ++ int64_t offset, int bytes, ++ BlockCompletionFunc *cb, void *opaque) ++{ ++ return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0, ++ cb, opaque); ++} ++ ++static void blk_pdiscard_entry(void *opaque) ++{ ++ BlkRwCo *rwco = opaque; ++ 
QEMUIOVector *qiov = rwco->iobuf; ++ ++ rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); ++ aio_wait_kick(); ++} ++ ++int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) ++{ ++ return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); ++} ++ + int blk_co_flush(BlockBackend *blk) + { + blk_wait_while_drained(blk); +@@ -1597,6 +1596,21 @@ int blk_co_flush(BlockBackend *blk) + return bdrv_co_flush(blk_bs(blk)); + } + ++static void blk_aio_flush_entry(void *opaque) ++{ ++ BlkAioEmAIOCB *acb = opaque; ++ BlkRwCo *rwco = &acb->rwco; ++ ++ rwco->ret = blk_co_flush(rwco->blk); ++ blk_aio_complete(acb); ++} ++ ++BlockAIOCB *blk_aio_flush(BlockBackend *blk, ++ BlockCompletionFunc *cb, void *opaque) ++{ ++ return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque); ++} ++ + static void blk_flush_entry(void *opaque) + { + BlkRwCo *rwco = opaque; +@@ -2083,20 +2097,6 @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, + return bdrv_truncate(blk->root, offset, exact, prealloc, errp); + } + +-static void blk_pdiscard_entry(void *opaque) +-{ +- BlkRwCo *rwco = opaque; +- QEMUIOVector *qiov = rwco->iobuf; +- +- rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); +- aio_wait_kick(); +-} +- +-int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) +-{ +- return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); +-} +- + int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, + int64_t pos, int size) + { +-- +1.8.3.1 + diff --git a/kvm-block-backup-top-Don-t-acquire-context-while-droppin.patch b/kvm-block-backup-top-Don-t-acquire-context-while-droppin.patch new file mode 100755 index 0000000..45f506c --- /dev/null +++ b/kvm-block-backup-top-Don-t-acquire-context-while-droppin.patch @@ -0,0 +1,130 @@ +From aefff389c4d11bd69180db7177135c4645a9b1bd Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:46 +0000 +Subject: [PATCH 13/18] block/backup-top: Don't acquire 
context while dropping + top + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-7-slp@redhat.com> +Patchwork-id: 93759 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 6/9] block/backup-top: Don't acquire context while dropping top +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +All paths that lead to bdrv_backup_top_drop(), except for the call +from backup_clean(), imply that the BDS AioContext has already been +acquired, so doing it there too can potentially lead to QEMU hanging +on AIO_WAIT_WHILE(). + +An easy way to trigger this situation is by issuing a two actions +transaction, with a proper and a bogus blockdev-backup, so the second +one will trigger a rollback. This will trigger a hang with an stack +trace like this one: + + #0 0x00007fb680c75016 in __GI_ppoll (fds=0x55e74580f7c0, nfds=1, timeout=, + timeout@entry=0x0, sigmask=sigmask@entry=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:39 + #1 0x000055e743386e09 in ppoll (__ss=0x0, __timeout=0x0, __nfds=, __fds=) + at /usr/include/bits/poll2.h:77 + #2 0x000055e743386e09 in qemu_poll_ns + (fds=, nfds=, timeout=) at util/qemu-timer.c:336 + #3 0x000055e743388dc4 in aio_poll (ctx=0x55e7458925d0, blocking=blocking@entry=true) + at util/aio-posix.c:669 + #4 0x000055e743305dea in bdrv_flush (bs=bs@entry=0x55e74593c0d0) at block/io.c:2878 + #5 0x000055e7432be58e in bdrv_close (bs=0x55e74593c0d0) at block.c:4017 + #6 0x000055e7432be58e in bdrv_delete (bs=) at block.c:4262 + #7 0x000055e7432be58e in bdrv_unref (bs=bs@entry=0x55e74593c0d0) at block.c:5644 + #8 0x000055e743316b9b in bdrv_backup_top_drop (bs=bs@entry=0x55e74593c0d0) at block/backup-top.c:273 + #9 0x000055e74331461f in backup_job_create + (job_id=0x0, bs=bs@entry=0x55e7458d5820, target=target@entry=0x55e74589f640, speed=0, sync_mode=MIRROR_SYNC_MODE_FULL, sync_bitmap=sync_bitmap@entry=0x0, 
bitmap_mode=BITMAP_SYNC_MODE_ON_SUCCESS, compress=false, filter_node_name=0x0, on_source_error=BLOCKDEV_ON_ERROR_REPORT, on_target_error=BLOCKDEV_ON_ERROR_REPORT, creation_flags=0, cb=0x0, opaque=0x0, txn=0x0, errp=0x7ffddfd1efb0) at block/backup.c:478 + #10 0x000055e74315bc52 in do_backup_common + (backup=backup@entry=0x55e746c066d0, bs=bs@entry=0x55e7458d5820, target_bs=target_bs@entry=0x55e74589f640, aio_context=aio_context@entry=0x55e7458a91e0, txn=txn@entry=0x0, errp=errp@entry=0x7ffddfd1efb0) + at blockdev.c:3580 + #11 0x000055e74315c37c in do_blockdev_backup + (backup=backup@entry=0x55e746c066d0, txn=0x0, errp=errp@entry=0x7ffddfd1efb0) + at /usr/src/debug/qemu-kvm-4.2.0-2.module+el8.2.0+5135+ed3b2489.x86_64/./qapi/qapi-types-block-core.h:1492 + #12 0x000055e74315c449 in blockdev_backup_prepare (common=0x55e746a8de90, errp=0x7ffddfd1f018) + at blockdev.c:1885 + #13 0x000055e743160152 in qmp_transaction + (dev_list=, has_props=, props=0x55e7467fe2c0, errp=errp@entry=0x7ffddfd1f088) at blockdev.c:2340 + #14 0x000055e743287ff5 in qmp_marshal_transaction + (args=, ret=, errp=0x7ffddfd1f0f8) + at qapi/qapi-commands-transaction.c:44 + #15 0x000055e74333de6c in do_qmp_dispatch + (errp=0x7ffddfd1f0f0, allow_oob=, request=, cmds=0x55e743c28d60 ) at qapi/qmp-dispatch.c:132 + #16 0x000055e74333de6c in qmp_dispatch + (cmds=0x55e743c28d60 , request=, allow_oob=) + at qapi/qmp-dispatch.c:175 + #17 0x000055e74325c061 in monitor_qmp_dispatch (mon=0x55e745908030, req=) + at monitor/qmp.c:145 + #18 0x000055e74325c6fa in monitor_qmp_bh_dispatcher (data=) at monitor/qmp.c:234 + #19 0x000055e743385866 in aio_bh_call (bh=0x55e745807ae0) at util/async.c:117 + #20 0x000055e743385866 in aio_bh_poll (ctx=ctx@entry=0x55e7458067a0) at util/async.c:117 + #21 0x000055e743388c54 in aio_dispatch (ctx=0x55e7458067a0) at util/aio-posix.c:459 + #22 0x000055e743385742 in aio_ctx_dispatch + (source=, callback=, user_data=) at util/async.c:260 + #23 0x00007fb68543e67d in g_main_dispatch 
(context=0x55e745893a40) at gmain.c:3176 + #24 0x00007fb68543e67d in g_main_context_dispatch (context=context@entry=0x55e745893a40) at gmain.c:3829 + #25 0x000055e743387d08 in glib_pollfds_poll () at util/main-loop.c:219 + #26 0x000055e743387d08 in os_host_main_loop_wait (timeout=) at util/main-loop.c:242 + #27 0x000055e743387d08 in main_loop_wait (nonblocking=) at util/main-loop.c:518 + #28 0x000055e74316a3c1 in main_loop () at vl.c:1828 + #29 0x000055e743016a72 in main (argc=, argv=, envp=) + at vl.c:4504 + +Fix this by not acquiring the AioContext there, and ensuring all paths +leading to it have it already acquired (backup_clean()). + +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1782111 +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 0abf2581717a19d9749d5c2ff8acd0ac203452c2) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + block/backup-top.c | 5 ----- + block/backup.c | 3 +++ + 2 files changed, 3 insertions(+), 5 deletions(-) + +diff --git a/block/backup-top.c b/block/backup-top.c +index 818d3f2..b8d863f 100644 +--- a/block/backup-top.c ++++ b/block/backup-top.c +@@ -255,9 +255,6 @@ append_failed: + void bdrv_backup_top_drop(BlockDriverState *bs) + { + BDRVBackupTopState *s = bs->opaque; +- AioContext *aio_context = bdrv_get_aio_context(bs); +- +- aio_context_acquire(aio_context); + + bdrv_drained_begin(bs); + +@@ -271,6 +268,4 @@ void bdrv_backup_top_drop(BlockDriverState *bs) + bdrv_drained_end(bs); + + bdrv_unref(bs); +- +- aio_context_release(aio_context); + } +diff --git a/block/backup.c b/block/backup.c +index cf62b1a..1383e21 100644 +--- a/block/backup.c ++++ b/block/backup.c +@@ -135,8 +135,11 @@ static void backup_abort(Job *job) + static void backup_clean(Job *job) + { + BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); ++ AioContext *aio_context = bdrv_get_aio_context(s->backup_top); + ++ aio_context_acquire(aio_context); + bdrv_backup_top_drop(s->backup_top); ++ 
aio_context_release(aio_context); + } + + void backup_do_checkpoint(BlockJob *job, Error **errp) +-- +1.8.3.1 + diff --git a/kvm-block-bdrv_reopen-with-backing-file-in-different-Aio.patch b/kvm-block-bdrv_reopen-with-backing-file-in-different-Aio.patch new file mode 100755 index 0000000..745be9f --- /dev/null +++ b/kvm-block-bdrv_reopen-with-backing-file-in-different-Aio.patch @@ -0,0 +1,114 @@ +From 1e0582ad34e77a060e2067a35992979c9eae82c9 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:31 +0000 +Subject: [PATCH 11/20] block: bdrv_reopen() with backing file in different + AioContext + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-6-kwolf@redhat.com> +Patchwork-id: 94282 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 05/13] block: bdrv_reopen() with backing file in different AioContext +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +This patch allows bdrv_reopen() (and therefore the x-blockdev-reopen QMP +command) to attach a node as the new backing file even if the node is in +a different AioContext than the parent if one of both nodes can be moved +to the AioContext of the other node. + +Signed-off-by: Kevin Wolf +Tested-by: Peter Krempa +Message-Id: <20200306141413.30705-3-kwolf@redhat.com> +Reviewed-by: Alberto Garcia +Signed-off-by: Kevin Wolf +(cherry picked from commit 1de6b45fb5c1489b450df7d1a4c692bba9678ce6) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block.c | 32 ++++++++++++++++++++++++++------ + tests/qemu-iotests/245 | 8 +++----- + 2 files changed, 29 insertions(+), 11 deletions(-) + +diff --git a/block.c b/block.c +index a744bb5..39e4647 100644 +--- a/block.c ++++ b/block.c +@@ -3749,6 +3749,29 @@ static void bdrv_reopen_perm(BlockReopenQueue *q, BlockDriverState *bs, + *shared = cumulative_shared_perms; + } + ++static bool bdrv_reopen_can_attach(BlockDriverState *parent, ++ BdrvChild *child, ++ BlockDriverState *new_child, ++ Error **errp) ++{ ++ AioContext *parent_ctx = bdrv_get_aio_context(parent); ++ AioContext *child_ctx = bdrv_get_aio_context(new_child); ++ GSList *ignore; ++ bool ret; ++ ++ ignore = g_slist_prepend(NULL, child); ++ ret = bdrv_can_set_aio_context(new_child, parent_ctx, &ignore, NULL); ++ g_slist_free(ignore); ++ if (ret) { ++ return ret; ++ } ++ ++ ignore = g_slist_prepend(NULL, child); ++ ret = bdrv_can_set_aio_context(parent, child_ctx, &ignore, errp); ++ g_slist_free(ignore); ++ return ret; ++} ++ + /* + * Take a BDRVReopenState and check if the value of 'backing' in the + * reopen_state->options QDict is valid or not. +@@ -3800,14 +3823,11 @@ static int bdrv_reopen_parse_backing(BDRVReopenState *reopen_state, + } + + /* +- * TODO: before removing the x- prefix from x-blockdev-reopen we +- * should move the new backing file into the right AioContext +- * instead of returning an error. ++ * Check AioContext compatibility so that the bdrv_set_backing_hd() call in ++ * bdrv_reopen_commit() won't fail. 
+ */ + if (new_backing_bs) { +- if (bdrv_get_aio_context(new_backing_bs) != bdrv_get_aio_context(bs)) { +- error_setg(errp, "Cannot use a new backing file " +- "with a different AioContext"); ++ if (!bdrv_reopen_can_attach(bs, bs->backing, new_backing_bs, errp)) { + return -EINVAL; + } + } +diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 +index f69c2fa..919131d 100644 +--- a/tests/qemu-iotests/245 ++++ b/tests/qemu-iotests/245 +@@ -1013,18 +1013,16 @@ class TestBlockdevReopen(iotests.QMPTestCase): + # neither of them can switch to the other AioContext + def test_iothreads_error(self): + self.run_test_iothreads('iothread0', 'iothread1', +- "Cannot use a new backing file with a different AioContext") ++ "Cannot change iothread of active block backend") + + def test_iothreads_compatible_users(self): + self.run_test_iothreads('iothread0', 'iothread0') + + def test_iothreads_switch_backing(self): +- self.run_test_iothreads('iothread0', None, +- "Cannot use a new backing file with a different AioContext") ++ self.run_test_iothreads('iothread0', None) + + def test_iothreads_switch_overlay(self): +- self.run_test_iothreads(None, 'iothread0', +- "Cannot use a new backing file with a different AioContext") ++ self.run_test_iothreads(None, 'iothread0') + + if __name__ == '__main__': + iotests.main(supported_fmts=["qcow2"], +-- +1.8.3.1 + diff --git a/kvm-block-curl-HTTP-header-field-names-are-case-insensit.patch b/kvm-block-curl-HTTP-header-field-names-are-case-insensit.patch new file mode 100755 index 0000000..a974a18 --- /dev/null +++ b/kvm-block-curl-HTTP-header-field-names-are-case-insensit.patch @@ -0,0 +1,55 @@ +From 5e5ca17e1e09cfe9a780c556528bbde23c93fc4e Mon Sep 17 00:00:00 2001 +From: Richard Jones +Date: Thu, 28 May 2020 14:27:37 +0100 +Subject: [PATCH 03/26] block/curl: HTTP header field names are case + insensitive +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Richard Jones +Message-id: 
<20200528142737.17318-3-rjones@redhat.com> +Patchwork-id: 96895 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] block/curl: HTTP header field names are case insensitive +Bugzilla: 1841038 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Philippe Mathieu-Daudé + +From: David Edmondson + +RFC 7230 section 3.2 indicates that HTTP header field names are case +insensitive. + +Signed-off-by: David Edmondson +Message-Id: <20200224101310.101169-3-david.edmondson@oracle.com> +Reviewed-by: Max Reitz +Signed-off-by: Max Reitz +(cherry picked from commit 69032253c33ae1774233c63cedf36d32242a85fc) +Signed-off-by: Danilo C. L. de Paula +--- + block/curl.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/block/curl.c b/block/curl.c +index f9ffb7f..6e32590 100644 +--- a/block/curl.c ++++ b/block/curl.c +@@ -216,11 +216,12 @@ static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque) + size_t realsize = size * nmemb; + const char *header = (char *)ptr; + const char *end = header + realsize; +- const char *accept_ranges = "Accept-Ranges:"; ++ const char *accept_ranges = "accept-ranges:"; + const char *bytes = "bytes"; + + if (realsize >= strlen(accept_ranges) +- && strncmp(header, accept_ranges, strlen(accept_ranges)) == 0) { ++ && g_ascii_strncasecmp(header, accept_ranges, ++ strlen(accept_ranges)) == 0) { + + char *p = strchr(header, ':') + 1; + +-- +1.8.3.1 + diff --git a/kvm-block-curl-HTTP-header-fields-allow-whitespace-aroun.patch b/kvm-block-curl-HTTP-header-fields-allow-whitespace-aroun.patch new file mode 100755 index 0000000..c09a1e2 --- /dev/null +++ b/kvm-block-curl-HTTP-header-fields-allow-whitespace-aroun.patch @@ -0,0 +1,76 @@ +From e5ac775de83d3d22f13c74ab198780b8b579f684 Mon Sep 17 00:00:00 2001 +From: Richard Jones +Date: Thu, 28 May 2020 14:27:36 +0100 +Subject: [PATCH 02/26] block/curl: HTTP header fields allow whitespace around + values + +RH-Author: Richard Jones +Message-id: 
<20200528142737.17318-2-rjones@redhat.com> +Patchwork-id: 96894 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] block/curl: HTTP header fields allow whitespace around values +Bugzilla: 1841038 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Danilo de Paula + +From: David Edmondson + +RFC 7230 section 3.2 indicates that whitespace is permitted between +the field name and field value and after the field value. + +Signed-off-by: David Edmondson +Message-Id: <20200224101310.101169-2-david.edmondson@oracle.com> +Reviewed-by: Max Reitz +Signed-off-by: Max Reitz +(cherry picked from commit 7788a319399f17476ff1dd43164c869e320820a2) +Signed-off-by: Danilo C. L. de Paula +--- + block/curl.c | 31 +++++++++++++++++++++++++++---- + 1 file changed, 27 insertions(+), 4 deletions(-) + +diff --git a/block/curl.c b/block/curl.c +index f862993..f9ffb7f 100644 +--- a/block/curl.c ++++ b/block/curl.c +@@ -214,11 +214,34 @@ static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque) + { + BDRVCURLState *s = opaque; + size_t realsize = size * nmemb; +- const char *accept_line = "Accept-Ranges: bytes"; ++ const char *header = (char *)ptr; ++ const char *end = header + realsize; ++ const char *accept_ranges = "Accept-Ranges:"; ++ const char *bytes = "bytes"; + +- if (realsize >= strlen(accept_line) +- && strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) { +- s->accept_range = true; ++ if (realsize >= strlen(accept_ranges) ++ && strncmp(header, accept_ranges, strlen(accept_ranges)) == 0) { ++ ++ char *p = strchr(header, ':') + 1; ++ ++ /* Skip whitespace between the header name and value. */ ++ while (p < end && *p && g_ascii_isspace(*p)) { ++ p++; ++ } ++ ++ if (end - p >= strlen(bytes) ++ && strncmp(p, bytes, strlen(bytes)) == 0) { ++ ++ /* Check that there is nothing but whitespace after the value. 
*/ ++ p += strlen(bytes); ++ while (p < end && *p && g_ascii_isspace(*p)) { ++ p++; ++ } ++ ++ if (p == end || !*p) { ++ s->accept_range = true; ++ } ++ } + } + + return realsize; +-- +1.8.3.1 + diff --git a/kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch b/kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch new file mode 100755 index 0000000..60b1b0a --- /dev/null +++ b/kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch @@ -0,0 +1,76 @@ +From 8c339c3535728179acc94deb5b922aebcfac9ab6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 3 Jun 2021 16:13:34 -0400 +Subject: [PATCH 2/4] block/file-posix: Fix problem with fallocate(PUNCH_HOLE) + on GPFS + +RH-Author: Thomas Huth +Message-id: <20210603161334.607005-2-thuth@redhat.com> +Patchwork-id: 101673 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] block/file-posix: Fix problem with fallocate(PUNCH_HOLE) on GPFS +Bugzilla: 1944861 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Max Reitz +RH-Acked-by: Cornelia Huck +RH-Acked-by: Laszlo Ersek + +A customer reported that running + + qemu-img convert -t none -O qcow2 -f qcow2 input.qcow2 output.qcow2 + +fails for them with the following error message when the images are +stored on a GPFS file system : + + qemu-img: error while writing sector 0: Invalid argument + +After analyzing the strace output, it seems like the problem is in +handle_aiocb_write_zeroes(): The call to fallocate(FALLOC_FL_PUNCH_HOLE) +returns EINVAL, which can apparently happen if the file system has +a different idea of the granularity of the operation. It's arguably +a bug in GPFS, since the PUNCH_HOLE mode should not result in EINVAL +according to the man-page of fallocate(), but the file system is out +there in production and so we have to deal with it. 
In commit 294682cc3a +("block: workaround for unaligned byte range in fallocate()") we also +already applied the a work-around for the same problem to the earlier +fallocate(FALLOC_FL_ZERO_RANGE) call, so do it now similar with the +PUNCH_HOLE call. But instead of silently catching and returning +-ENOTSUP (which causes the caller to fall back to writing zeroes), +let's rather inform the user once about the buggy file system and +try the other fallback instead. + +Signed-off-by: Thomas Huth +Message-Id: <20210527172020.847617-2-thuth@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 73ebf29729d1a40feaa9f8ab8951b6ee6dbfbede) +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1944861 +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 62a463229f..371572f1b0 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -1587,6 +1587,17 @@ static int handle_aiocb_write_zeroes(void *opaque) + return ret; + } + s->has_fallocate = false; ++ } else if (ret == -EINVAL) { ++ /* ++ * Some file systems like older versions of GPFS do not like un- ++ * aligned byte ranges, and return EINVAL in such a case, though ++ * they should not do it according to the man-page of fallocate(). ++ * Warn about the bad filesystem and try the final fallback instead. ++ */ ++ warn_report_once("Your file system is misbehaving: " ++ "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. 
" ++ "Please report this bug to your file sytem " ++ "vendor."); + } else if (ret != -ENOTSUP) { + return ret; + } else { +-- +2.27.0 + diff --git a/kvm-block-introducing-bdrv_co_delete_file-interface.patch b/kvm-block-introducing-bdrv_co_delete_file-interface.patch new file mode 100755 index 0000000..9d5e659 --- /dev/null +++ b/kvm-block-introducing-bdrv_co_delete_file-interface.patch @@ -0,0 +1,99 @@ +From 9581770f48911cbe68cfa1a7fa125df2a0a27d02 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Sun, 31 May 2020 16:40:33 +0100 +Subject: [PATCH 5/7] block: introducing 'bdrv_co_delete_file' interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Maxim Levitsky +Message-id: <20200531164035.34188-2-mlevitsk@redhat.com> +Patchwork-id: 97057 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/3] block: introducing 'bdrv_co_delete_file' interface +Bugzilla: 1827630 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: John Snow +RH-Acked-by: Eric Blake + +From: Daniel Henrique Barboza + +Adding to Block Drivers the capability of being able to clean up +its created files can be useful in certain situations. For the +LUKS driver, for instance, a failure in one of its authentication +steps can leave files in the host that weren't there before. + +This patch adds the 'bdrv_co_delete_file' interface to block +drivers and add it to the 'file' driver in file-posix.c. The +implementation is given by 'raw_co_delete_file'. + +Suggested-by: Daniel P. Berrangé +Signed-off-by: Daniel Henrique Barboza +Message-Id: <20200130213907.2830642-2-danielhb413@gmail.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 9bffae14df879255329473a7bd578643af2d4c9c) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. 
de Paula +--- + block/file-posix.c | 23 +++++++++++++++++++++++ + include/block/block_int.h | 4 ++++ + 2 files changed, 27 insertions(+) + +diff --git a/block/file-posix.c b/block/file-posix.c +index dd18d40..1609598 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -2388,6 +2388,28 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + return raw_co_create(&options, errp); + } + ++static int coroutine_fn raw_co_delete_file(BlockDriverState *bs, ++ Error **errp) ++{ ++ struct stat st; ++ int ret; ++ ++ if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) { ++ error_setg_errno(errp, ENOENT, "%s is not a regular file", ++ bs->filename); ++ return -ENOENT; ++ } ++ ++ ret = unlink(bs->filename); ++ if (ret < 0) { ++ ret = -errno; ++ error_setg_errno(errp, -ret, "Error when deleting file %s", ++ bs->filename); ++ } ++ ++ return ret; ++} ++ + /* + * Find allocation range in @bs around offset @start. + * May change underlying file descriptor's file offset. +@@ -3019,6 +3041,7 @@ BlockDriver bdrv_file = { + .bdrv_co_block_status = raw_co_block_status, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, ++ .bdrv_co_delete_file = raw_co_delete_file, + + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 529f153..562dca1 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -316,6 +316,10 @@ struct BlockDriver { + */ + int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs); + ++ /* Delete a created file. */ ++ int coroutine_fn (*bdrv_co_delete_file)(BlockDriverState *bs, ++ Error **errp); ++ + /* + * Flushes all data that was already written to the OS all the way down to + * the disk (for example file-posix.c calls fsync()). 
+-- +1.8.3.1 + diff --git a/kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch b/kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch new file mode 100755 index 0000000..fe8c49b --- /dev/null +++ b/kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch @@ -0,0 +1,100 @@ +From b9b77159567283628645943b5367d39b558e8faa Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 26 Jan 2021 20:07:59 -0500 +Subject: [PATCH 9/9] block/iscsi:fix heap-buffer-overflow in + iscsi_aio_ioctl_cb +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210126200759.245891-2-jmaloy@redhat.com> +Patchwork-id: 100787 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] block/iscsi:fix heap-buffer-overflow in iscsi_aio_ioctl_cb +Bugzilla: 1912974 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf +RH-Acked-by: Laszlo Ersek + +From: Chen Qun + +There is an overflow, the source 'datain.data[2]' is 100 bytes, + but the 'ss' is 252 bytes. This may cause a security issue because + we can access a lot of unrelated memory data. + +The len for sbp copy data should take the minimum of mx_sb_len and + sb_len_wr, not the maximum. 
+ +If we use iscsi device for VM backend storage, ASAN show stack: + +READ of size 252 at 0xfffd149dcfc4 thread T0 + #0 0xaaad433d0d34 in __asan_memcpy (aarch64-softmmu/qemu-system-aarch64+0x2cb0d34) + #1 0xaaad45f9d6d0 in iscsi_aio_ioctl_cb /qemu/block/iscsi.c:996:9 + #2 0xfffd1af0e2dc (/usr/lib64/iscsi/libiscsi.so.8+0xe2dc) + #3 0xfffd1af0d174 (/usr/lib64/iscsi/libiscsi.so.8+0xd174) + #4 0xfffd1af19fac (/usr/lib64/iscsi/libiscsi.so.8+0x19fac) + #5 0xaaad45f9acc8 in iscsi_process_read /qemu/block/iscsi.c:403:5 + #6 0xaaad4623733c in aio_dispatch_handler /qemu/util/aio-posix.c:467:9 + #7 0xaaad4622f350 in aio_dispatch_handlers /qemu/util/aio-posix.c:510:20 + #8 0xaaad4622f350 in aio_dispatch /qemu/util/aio-posix.c:520 + #9 0xaaad46215944 in aio_ctx_dispatch /qemu/util/async.c:298:5 + #10 0xfffd1bed12f4 in g_main_context_dispatch (/lib64/libglib-2.0.so.0+0x512f4) + #11 0xaaad46227de0 in glib_pollfds_poll /qemu/util/main-loop.c:219:9 + #12 0xaaad46227de0 in os_host_main_loop_wait /qemu/util/main-loop.c:242 + #13 0xaaad46227de0 in main_loop_wait /qemu/util/main-loop.c:518 + #14 0xaaad43d9d60c in qemu_main_loop /qemu/softmmu/vl.c:1662:9 + #15 0xaaad4607a5b0 in main /qemu/softmmu/main.c:49:5 + #16 0xfffd1a460b9c in __libc_start_main (/lib64/libc.so.6+0x20b9c) + #17 0xaaad43320740 in _start (aarch64-softmmu/qemu-system-aarch64+0x2c00740) + +0xfffd149dcfc4 is located 0 bytes to the right of 100-byte region [0xfffd149dcf60,0xfffd149dcfc4) +allocated by thread T0 here: + #0 0xaaad433d1e70 in __interceptor_malloc (aarch64-softmmu/qemu-system-aarch64+0x2cb1e70) + #1 0xfffd1af0e254 (/usr/lib64/iscsi/libiscsi.so.8+0xe254) + #2 0xfffd1af0d174 (/usr/lib64/iscsi/libiscsi.so.8+0xd174) + #3 0xfffd1af19fac (/usr/lib64/iscsi/libiscsi.so.8+0x19fac) + #4 0xaaad45f9acc8 in iscsi_process_read /qemu/block/iscsi.c:403:5 + #5 0xaaad4623733c in aio_dispatch_handler /qemu/util/aio-posix.c:467:9 + #6 0xaaad4622f350 in aio_dispatch_handlers /qemu/util/aio-posix.c:510:20 + #7 0xaaad4622f350 in 
aio_dispatch /qemu/util/aio-posix.c:520 + #8 0xaaad46215944 in aio_ctx_dispatch /qemu/util/async.c:298:5 + #9 0xfffd1bed12f4 in g_main_context_dispatch (/lib64/libglib-2.0.so.0+0x512f4) + #10 0xaaad46227de0 in glib_pollfds_poll /qemu/util/main-loop.c:219:9 + #11 0xaaad46227de0 in os_host_main_loop_wait /qemu/util/main-loop.c:242 + #12 0xaaad46227de0 in main_loop_wait /qemu/util/main-loop.c:518 + #13 0xaaad43d9d60c in qemu_main_loop /qemu/softmmu/vl.c:1662:9 + #14 0xaaad4607a5b0 in main /qemu/softmmu/main.c:49:5 + #15 0xfffd1a460b9c in __libc_start_main (/lib64/libc.so.6+0x20b9c) + #16 0xaaad43320740 in _start (aarch64-softmmu/qemu-system-aarch64+0x2c00740) + +Reported-by: Euler Robot +Signed-off-by: Chen Qun +Reviewed-by: Stefan Hajnoczi +Message-id: 20200418062602.10776-1-kuhn.chenqun@huawei.com +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Peter Maydell + +(cherry picked from ff0507c239a246fd7215b31c5658fc6a3ee1e4c5) +Signed-off-by: Jon Maloy +Signed-off-by: Jon Maloy +--- + block/iscsi.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/block/iscsi.c b/block/iscsi.c +index 0bea2d3a93..06915655b3 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -991,8 +991,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, + acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE; + + acb->ioh->sb_len_wr = acb->task->datain.size - 2; +- ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ? 
+- acb->ioh->mx_sb_len : acb->ioh->sb_len_wr; ++ ss = MIN(acb->ioh->mx_sb_len, acb->ioh->sb_len_wr); + memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss); + } + +-- +2.18.2 + diff --git a/kvm-block-nbd-Fix-hang-in-.bdrv_close.patch b/kvm-block-nbd-Fix-hang-in-.bdrv_close.patch new file mode 100755 index 0000000..378ae1a --- /dev/null +++ b/kvm-block-nbd-Fix-hang-in-.bdrv_close.patch @@ -0,0 +1,78 @@ +From 4ef2c464a54b0b618d933641ac0a7012e629fed9 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:42 +0000 +Subject: [PATCH 01/20] block/nbd: Fix hang in .bdrv_close() + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-2-mlevitsk@redhat.com> +Patchwork-id: 94224 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 1/6] block/nbd: Fix hang in .bdrv_close() +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +When nbd_close() is called from a coroutine, the connection_co never +gets to run, and thus nbd_teardown_connection() hangs. + +This is because aio_co_enter() only puts the connection_co into the main +coroutine's wake-up queue, so this main coroutine needs to yield and +wait for connection_co to terminate. + +Suggested-by: Kevin Wolf +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-2-mreitz@redhat.com> +Reviewed-by: Eric Blake +Reviewed-by: Maxim Levitsky +Signed-off-by: Max Reitz +(cherry picked from commit 78c81a3f108870d325b0a39d88711366afe6f703) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. 
de Paula +--- + block/nbd.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +diff --git a/block/nbd.c b/block/nbd.c +index 5f18f78..a73f0d9 100644 +--- a/block/nbd.c ++++ b/block/nbd.c +@@ -70,6 +70,7 @@ typedef struct BDRVNBDState { + CoMutex send_mutex; + CoQueue free_sema; + Coroutine *connection_co; ++ Coroutine *teardown_co; + QemuCoSleepState *connection_co_sleep_ns_state; + bool drained; + bool wait_drained_end; +@@ -203,7 +204,15 @@ static void nbd_teardown_connection(BlockDriverState *bs) + qemu_co_sleep_wake(s->connection_co_sleep_ns_state); + } + } +- BDRV_POLL_WHILE(bs, s->connection_co); ++ if (qemu_in_coroutine()) { ++ s->teardown_co = qemu_coroutine_self(); ++ /* connection_co resumes us when it terminates */ ++ qemu_coroutine_yield(); ++ s->teardown_co = NULL; ++ } else { ++ BDRV_POLL_WHILE(bs, s->connection_co); ++ } ++ assert(!s->connection_co); + } + + static bool nbd_client_connecting(BDRVNBDState *s) +@@ -395,6 +404,9 @@ static coroutine_fn void nbd_connection_entry(void *opaque) + s->ioc = NULL; + } + ++ if (s->teardown_co) { ++ aio_co_wake(s->teardown_co); ++ } + aio_wait_kick(); + } + +-- +1.8.3.1 + diff --git a/kvm-block-pass-BlockDriver-reference-to-the-.bdrv_co_cre.patch b/kvm-block-pass-BlockDriver-reference-to-the-.bdrv_co_cre.patch new file mode 100755 index 0000000..43f9ffc --- /dev/null +++ b/kvm-block-pass-BlockDriver-reference-to-the-.bdrv_co_cre.patch @@ -0,0 +1,328 @@ +From 25c528b30f8774f33e957d14060805398da524d9 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 26 Mar 2020 20:23:06 +0000 +Subject: [PATCH 1/4] block: pass BlockDriver reference to the .bdrv_co_create + +RH-Author: Maxim Levitsky +Message-id: <20200326202307.9264-2-mlevitsk@redhat.com> +Patchwork-id: 94447 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] block: pass BlockDriver reference to the .bdrv_co_create +Bugzilla: 1816007 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Kevin Wolf +RH-Acked-by: Max Reitz + +This will allow the 
reuse of a single generic .bdrv_co_create +implementation for several drivers. +No functional changes. + +Signed-off-by: Maxim Levitsky +Message-Id: <20200326011218.29230-2-mlevitsk@redhat.com> +Reviewed-by: Denis V. Lunev +Signed-off-by: Max Reitz +(cherry picked from commit b92902dfeaafbceaf744ab7473f2d070284f6172) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 3 ++- + block/crypto.c | 3 ++- + block/file-posix.c | 4 +++- + block/file-win32.c | 4 +++- + block/gluster.c | 3 ++- + block/nfs.c | 4 +++- + block/parallels.c | 3 ++- + block/qcow.c | 3 ++- + block/qcow2.c | 4 +++- + block/qed.c | 3 ++- + block/raw-format.c | 4 +++- + block/rbd.c | 3 ++- + block/sheepdog.c | 4 +++- + block/ssh.c | 4 +++- + block/vdi.c | 4 +++- + block/vhdx.c | 3 ++- + block/vmdk.c | 4 +++- + block/vpc.c | 6 ++++-- + include/block/block_int.h | 3 ++- + 19 files changed, 49 insertions(+), 20 deletions(-) + +diff --git a/block.c b/block.c +index ec29b1e..f9a1c5b 100644 +--- a/block.c ++++ b/block.c +@@ -482,7 +482,8 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque) + CreateCo *cco = opaque; + assert(cco->drv); + +- ret = cco->drv->bdrv_co_create_opts(cco->filename, cco->opts, &local_err); ++ ret = cco->drv->bdrv_co_create_opts(cco->drv, ++ cco->filename, cco->opts, &local_err); + error_propagate(&cco->err, local_err); + cco->ret = ret; + } +diff --git a/block/crypto.c b/block/crypto.c +index 2482383..970d463 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -539,7 +539,8 @@ fail: + return ret; + } + +-static int coroutine_fn block_crypto_co_create_opts_luks(const char *filename, ++static int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/file-posix.c b/block/file-posix.c +index fd29372..a2e0a74 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -2346,7 +2346,9 @@ out: + return result; + } + +-static int coroutine_fn 
raw_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn raw_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions options; +diff --git a/block/file-win32.c b/block/file-win32.c +index 77e8ff7..1585983 100644 +--- a/block/file-win32.c ++++ b/block/file-win32.c +@@ -588,7 +588,9 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp) + return 0; + } + +-static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn raw_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions options; +diff --git a/block/gluster.c b/block/gluster.c +index 4fa4a77..0aa1f2c 100644 +--- a/block/gluster.c ++++ b/block/gluster.c +@@ -1130,7 +1130,8 @@ out: + return ret; + } + +-static int coroutine_fn qemu_gluster_co_create_opts(const char *filename, ++static int coroutine_fn qemu_gluster_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/nfs.c b/block/nfs.c +index 9a6311e..cc2413d 100644 +--- a/block/nfs.c ++++ b/block/nfs.c +@@ -662,7 +662,9 @@ out: + return ret; + } + +-static int coroutine_fn nfs_file_co_create_opts(const char *url, QemuOpts *opts, ++static int coroutine_fn nfs_file_co_create_opts(BlockDriver *drv, ++ const char *url, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions *create_options; +diff --git a/block/parallels.c b/block/parallels.c +index 7a01997..6d4ed77 100644 +--- a/block/parallels.c ++++ b/block/parallels.c +@@ -609,7 +609,8 @@ exit: + goto out; + } + +-static int coroutine_fn parallels_co_create_opts(const char *filename, ++static int coroutine_fn parallels_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/qcow.c b/block/qcow.c +index fce8989..8973e4e 100644 +--- a/block/qcow.c ++++ b/block/qcow.c +@@ 
-934,7 +934,8 @@ exit: + return ret; + } + +-static int coroutine_fn qcow_co_create_opts(const char *filename, ++static int coroutine_fn qcow_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, Error **errp) + { + BlockdevCreateOptions *create_options = NULL; +diff --git a/block/qcow2.c b/block/qcow2.c +index 83b1fc0..71067c6 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -3558,7 +3558,9 @@ out: + return ret; + } + +-static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn qcow2_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions *create_options = NULL; +diff --git a/block/qed.c b/block/qed.c +index d8c4e5f..1af9b3c 100644 +--- a/block/qed.c ++++ b/block/qed.c +@@ -720,7 +720,8 @@ out: + return ret; + } + +-static int coroutine_fn bdrv_qed_co_create_opts(const char *filename, ++static int coroutine_fn bdrv_qed_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/raw-format.c b/block/raw-format.c +index 3a76ec7..93b25e1 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -419,7 +419,9 @@ static int raw_has_zero_init_truncate(BlockDriverState *bs) + return bdrv_has_zero_init_truncate(bs->file->bs); + } + +-static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn raw_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + return bdrv_create_file(filename, opts, errp); +diff --git a/block/rbd.c b/block/rbd.c +index 027cbcc..8847259 100644 +--- a/block/rbd.c ++++ b/block/rbd.c +@@ -425,7 +425,8 @@ static int qemu_rbd_co_create(BlockdevCreateOptions *options, Error **errp) + return qemu_rbd_do_create(options, NULL, NULL, errp); + } + +-static int coroutine_fn qemu_rbd_co_create_opts(const char *filename, ++static int coroutine_fn 
qemu_rbd_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/sheepdog.c b/block/sheepdog.c +index cfa8433..a8a7e32 100644 +--- a/block/sheepdog.c ++++ b/block/sheepdog.c +@@ -2157,7 +2157,9 @@ out: + return ret; + } + +-static int coroutine_fn sd_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn sd_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions *create_options = NULL; +diff --git a/block/ssh.c b/block/ssh.c +index b4375cf..84e9282 100644 +--- a/block/ssh.c ++++ b/block/ssh.c +@@ -963,7 +963,9 @@ fail: + return ret; + } + +-static int coroutine_fn ssh_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn ssh_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions *create_options; +diff --git a/block/vdi.c b/block/vdi.c +index 0142da7..e1a11f2 100644 +--- a/block/vdi.c ++++ b/block/vdi.c +@@ -896,7 +896,9 @@ static int coroutine_fn vdi_co_create(BlockdevCreateOptions *create_options, + return vdi_co_do_create(create_options, DEFAULT_CLUSTER_SIZE, errp); + } + +-static int coroutine_fn vdi_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn vdi_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + QDict *qdict = NULL; +diff --git a/block/vhdx.c b/block/vhdx.c +index f02d261..33e57cd 100644 +--- a/block/vhdx.c ++++ b/block/vhdx.c +@@ -2046,7 +2046,8 @@ delete_and_exit: + return ret; + } + +-static int coroutine_fn vhdx_co_create_opts(const char *filename, ++static int coroutine_fn vhdx_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/vmdk.c b/block/vmdk.c +index 20e909d..eb726f2 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -2588,7 +2588,9 @@ exit: + return blk; + } + 
+-static int coroutine_fn vmdk_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn vmdk_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + Error *local_err = NULL; +diff --git a/block/vpc.c b/block/vpc.c +index a655502..6df75e2 100644 +--- a/block/vpc.c ++++ b/block/vpc.c +@@ -1089,8 +1089,10 @@ out: + return ret; + } + +-static int coroutine_fn vpc_co_create_opts(const char *filename, +- QemuOpts *opts, Error **errp) ++static int coroutine_fn vpc_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, ++ Error **errp) + { + BlockdevCreateOptions *create_options = NULL; + QDict *qdict; +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 96e327b..7ff81be 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -136,7 +136,8 @@ struct BlockDriver { + void (*bdrv_close)(BlockDriverState *bs); + int coroutine_fn (*bdrv_co_create)(BlockdevCreateOptions *opts, + Error **errp); +- int coroutine_fn (*bdrv_co_create_opts)(const char *filename, ++ int coroutine_fn (*bdrv_co_create_opts)(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp); + int (*bdrv_make_empty)(BlockDriverState *bs); +-- +1.8.3.1 + diff --git a/kvm-block-qcow2-Move-bitmap-reopen-into-bdrv_reopen_comm.patch b/kvm-block-qcow2-Move-bitmap-reopen-into-bdrv_reopen_comm.patch new file mode 100755 index 0000000..2c27fd2 --- /dev/null +++ b/kvm-block-qcow2-Move-bitmap-reopen-into-bdrv_reopen_comm.patch @@ -0,0 +1,78 @@ +From ec5408763c49cd0b63ee324bdc38a429ed1adeee Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:29 +0000 +Subject: [PATCH 09/20] block/qcow2: Move bitmap reopen into + bdrv_reopen_commit_post + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-4-kwolf@redhat.com> +Patchwork-id: 94280 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 03/13] block/qcow2: Move bitmap reopen into bdrv_reopen_commit_post +Bugzilla: 
1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +From: Peter Krempa + +The bitmap code requires writing the 'file' child when the qcow2 driver +is reopened in read-write mode. + +If the 'file' child is being reopened due to a permissions change, the +modification is not committed yet when qcow2_reopen_commit is called. This +means that any attempt to write the 'file' child will end with EBADFD +as the original fd was already closed. + +Moving bitmap reopening to the new callback which is called after +permission modifications are committed fixes this as the file descriptor +will be replaced with the correct one. + +The above problem manifests itself when reopening 'qcow2' format layer +which uses a 'file-posix' file child which was opened with the +'auto-read-only' property set. + +Signed-off-by: Peter Krempa +Message-Id: +Signed-off-by: Kevin Wolf +(cherry picked from commit 65eb7c85a3e62529e2bad782e94d5a7b11dd5a92) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/qcow2.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/block/qcow2.c b/block/qcow2.c +index 7c18721..83b1fc0 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -1881,6 +1881,11 @@ fail: + static void qcow2_reopen_commit(BDRVReopenState *state) + { + qcow2_update_options_commit(state->bs, state->opaque); ++ g_free(state->opaque); ++} ++ ++static void qcow2_reopen_commit_post(BDRVReopenState *state) ++{ + if (state->flags & BDRV_O_RDWR) { + Error *local_err = NULL; + +@@ -1895,7 +1900,6 @@ static void qcow2_reopen_commit(BDRVReopenState *state) + bdrv_get_node_name(state->bs)); + } + } +- g_free(state->opaque); + } + + static void qcow2_reopen_abort(BDRVReopenState *state) +@@ -5492,6 +5496,7 @@ BlockDriver bdrv_qcow2 = { + .bdrv_close = qcow2_close, + .bdrv_reopen_prepare = qcow2_reopen_prepare, + .bdrv_reopen_commit = qcow2_reopen_commit, ++ .bdrv_reopen_commit_post = qcow2_reopen_commit_post, + .bdrv_reopen_abort = qcow2_reopen_abort, + .bdrv_join_options = qcow2_join_options, + .bdrv_child_perm = bdrv_format_default_perms, +-- +1.8.3.1 + diff --git a/kvm-block-trickle-down-the-fallback-image-creation-funct.patch b/kvm-block-trickle-down-the-fallback-image-creation-funct.patch new file mode 100755 index 0000000..5ba1521 --- /dev/null +++ b/kvm-block-trickle-down-the-fallback-image-creation-funct.patch @@ -0,0 +1,296 @@ +From a1f7b929ae1fe6fa424c520c3a5eb497333b0fd9 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 26 Mar 2020 20:23:07 +0000 +Subject: [PATCH 2/4] block: trickle down the fallback image creation function + use to the block drivers + +RH-Author: Maxim Levitsky +Message-id: <20200326202307.9264-3-mlevitsk@redhat.com> +Patchwork-id: 94446 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers +Bugzilla: 1816007 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Kevin Wolf +RH-Acked-by: Max Reitz + +Instead of checking the 
.bdrv_co_create_opts to see if we need the +fallback, just implement the .bdrv_co_create_opts in the drivers that +need it. + +This way we don't break various places that need to know if the +underlying protocol/format really supports image creation, and this way +we still allow some drivers to not support image creation. + +Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816007 + +Note that technically this patch reverts the image creation fallback +for the vxhs driver since I don't have a means to test it, and IMHO it +is better to leave it not supported as it was prior to generic image +creation patches. + +Also drop iscsi_create_opts which was left accidentally. + +Signed-off-by: Maxim Levitsky +Message-Id: <20200326011218.29230-3-mlevitsk@redhat.com> +Reviewed-by: Denis V. Lunev +[mreitz: Fixed alignment, and moved bdrv_co_create_opts_simple() and + bdrv_create_opts_simple from block.h into block_int.h] +Signed-off-by: Max Reitz +(cherry picked from commit 5a5e7f8cd86b7ced0732b1b6e28c82baa65b09c9) + +Contextual conflicts in block.c and include/block/block_int.h + +(conflict in block.c by default shows as functional but +with --diff-algorithm=patience it becomes a contextual conflict) + +... +001/2:[----] [--] 'block: pass BlockDriver reference to the .bdrv_co_create' +002/2:[0014] [FC] 'block: trickle down the fallback image creation function use to the block drivers' +... +002/2: 'meld <(git show 5a5e7f8^\!) <(git show 6d3bca5^\!)' + +So now running: +meld <(git show 5a5e7f8^\! --diff-algorithm=patience) <(git show 6d3bca5^\! --diff-algorithm=patience) + +shows no contextual conflicts +It is mostly due to missing commit f6dc1c31d3801dcbdf0c56574f9ff4f05180810c +Thanks to Max Reitz for helping me with this. + +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. 
de Paula +--- + block.c | 35 ++++++++++++++++++++--------------- + block/file-posix.c | 7 ++++++- + block/iscsi.c | 16 ++++------------ + block/nbd.c | 6 ++++++ + block/nvme.c | 3 +++ + include/block/block.h | 1 + + include/block/block_int.h | 11 +++++++++++ + 7 files changed, 51 insertions(+), 28 deletions(-) + +diff --git a/block.c b/block.c +index f9a1c5b..ba3b40d7 100644 +--- a/block.c ++++ b/block.c +@@ -597,8 +597,15 @@ static int create_file_fallback_zero_first_sector(BlockBackend *blk, + return 0; + } + +-static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, +- QemuOpts *opts, Error **errp) ++/** ++ * Simple implementation of bdrv_co_create_opts for protocol drivers ++ * which only support creation via opening a file ++ * (usually existing raw storage device) ++ */ ++int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, ++ Error **errp) + { + BlockBackend *blk; + QDict *options; +@@ -662,11 +669,7 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) + return -ENOENT; + } + +- if (drv->bdrv_co_create_opts) { +- return bdrv_create(drv, filename, opts, errp); +- } else { +- return bdrv_create_file_fallback(filename, drv, opts, errp); +- } ++ return bdrv_create(drv, filename, opts, errp); + } + + /** +@@ -1543,9 +1546,9 @@ QemuOptsList bdrv_runtime_opts = { + }, + }; + +-static QemuOptsList fallback_create_opts = { +- .name = "fallback-create-opts", +- .head = QTAILQ_HEAD_INITIALIZER(fallback_create_opts.head), ++QemuOptsList bdrv_create_opts_simple = { ++ .name = "simple-create-opts", ++ .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, +@@ -5910,13 +5913,15 @@ void bdrv_img_create(const char *filename, const char *fmt, + return; + } + +- create_opts = qemu_opts_append(create_opts, drv->create_opts); +- if (proto_drv->create_opts) { +- create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); +- } 
else { +- create_opts = qemu_opts_append(create_opts, &fallback_create_opts); ++ if (!proto_drv->create_opts) { ++ error_setg(errp, "Protocol driver '%s' does not support image creation", ++ proto_drv->format_name); ++ return; + } + ++ create_opts = qemu_opts_append(create_opts, drv->create_opts); ++ create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); ++ + /* Create parameter list with default values */ + opts = qemu_opts_create(create_opts, NULL, 0, &error_abort); + qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort); +diff --git a/block/file-posix.c b/block/file-posix.c +index a2e0a74..dd18d40 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -3432,6 +3432,8 @@ static BlockDriver bdrv_host_device = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, +@@ -3558,10 +3560,11 @@ static BlockDriver bdrv_host_cdrom = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + +- + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, +@@ -3690,6 +3693,8 @@ static BlockDriver bdrv_host_cdrom = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, + + .bdrv_co_preadv = raw_co_preadv, +diff --git 
a/block/iscsi.c b/block/iscsi.c +index b45da65..16b0716 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -2399,18 +2399,6 @@ out_unlock: + return r; + } + +-static QemuOptsList iscsi_create_opts = { +- .name = "iscsi-create-opts", +- .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), +- .desc = { +- { +- .name = BLOCK_OPT_SIZE, +- .type = QEMU_OPT_SIZE, +- .help = "Virtual disk size" +- }, +- { /* end of list */ } +- } +-}; + + static const char *const iscsi_strong_runtime_opts[] = { + "transport", +@@ -2434,6 +2422,8 @@ static BlockDriver bdrv_iscsi = { + .bdrv_parse_filename = iscsi_parse_filename, + .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_co_invalidate_cache = iscsi_co_invalidate_cache, +@@ -2471,6 +2461,8 @@ static BlockDriver bdrv_iser = { + .bdrv_parse_filename = iscsi_parse_filename, + .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_co_invalidate_cache = iscsi_co_invalidate_cache, +diff --git a/block/nbd.c b/block/nbd.c +index a73f0d9..927915d 100644 +--- a/block/nbd.c ++++ b/block/nbd.c +@@ -2030,6 +2030,8 @@ static BlockDriver bdrv_nbd = { + .protocol_name = "nbd", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_file_open = nbd_open, + .bdrv_reopen_prepare = nbd_client_reopen_prepare, + .bdrv_co_preadv = nbd_client_co_preadv, +@@ -2055,6 +2057,8 @@ static BlockDriver bdrv_nbd_tcp = { + .protocol_name = "nbd+tcp", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = 
nbd_parse_filename, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_file_open = nbd_open, + .bdrv_reopen_prepare = nbd_client_reopen_prepare, + .bdrv_co_preadv = nbd_client_co_preadv, +@@ -2080,6 +2084,8 @@ static BlockDriver bdrv_nbd_unix = { + .protocol_name = "nbd+unix", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_file_open = nbd_open, + .bdrv_reopen_prepare = nbd_client_reopen_prepare, + .bdrv_co_preadv = nbd_client_co_preadv, +diff --git a/block/nvme.c b/block/nvme.c +index d41c4bd..7b7c0cc 100644 +--- a/block/nvme.c ++++ b/block/nvme.c +@@ -1333,6 +1333,9 @@ static BlockDriver bdrv_nvme = { + .protocol_name = "nvme", + .instance_size = sizeof(BDRVNVMeState), + ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, ++ + .bdrv_parse_filename = nvme_parse_filename, + .bdrv_file_open = nvme_file_open, + .bdrv_close = nvme_close, +diff --git a/include/block/block.h b/include/block/block.h +index 1df9848..92685d2 100644 +--- a/include/block/block.h ++++ b/include/block/block.h +@@ -293,6 +293,7 @@ BlockDriver *bdrv_find_format(const char *format_name); + int bdrv_create(BlockDriver *drv, const char* filename, + QemuOpts *opts, Error **errp); + int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); ++ + BlockDriverState *bdrv_new(void); + void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, + Error **errp); +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 7ff81be..529f153 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -1325,4 +1325,15 @@ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, + + int refresh_total_sectors(BlockDriverState *bs, int64_t hint); + ++/** ++ * Simple implementation of 
bdrv_co_create_opts for protocol drivers ++ * which only support creation via opening a file ++ * (usually existing raw storage device) ++ */ ++int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, ++ Error **errp); ++extern QemuOptsList bdrv_create_opts_simple; ++ + #endif /* BLOCK_INT_H */ +-- +1.8.3.1 + diff --git a/kvm-block-truncate-Don-t-make-backing-file-data-visible.patch b/kvm-block-truncate-Don-t-make-backing-file-data-visible.patch new file mode 100755 index 0000000..114e1b7 --- /dev/null +++ b/kvm-block-truncate-Don-t-make-backing-file-data-visible.patch @@ -0,0 +1,94 @@ +From d84b9b93755ece6618ed98fa84386beeb1a0e40b Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:36 +0100 +Subject: [PATCH 08/17] block: truncate: Don't make backing file data visible + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-8-kwolf@redhat.com> +Patchwork-id: 97454 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 07/11] block: truncate: Don't make backing file data visible +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +When extending the size of an image that has a backing file larger than +its old size, make sure that the backing file data doesn't become +visible in the guest, but the added area is properly zeroed out. + +Consider the following scenario where the overlay is shorter than its +backing file: + + base.qcow2: AAAAAAAA + overlay.qcow2: BBBB + +When resizing (extending) overlay.qcow2, the new blocks should not stay +unallocated and make the additional As from base.qcow2 visible like +before this patch, but zeros should be read. 
+ +A similar case happens with the various variants of a commit job when an +intermediate file is short (- for unallocated): + + base.qcow2: A-A-AAAA + mid.qcow2: BB-B + top.qcow2: C--C--C- + +After commit top.qcow2 to mid.qcow2, the following happens: + + mid.qcow2: CB-C00C0 (correct result) + mid.qcow2: CB-C--C- (before this fix) + +Without the fix, blocks that previously read as zeros on top.qcow2 +suddenly turn into A. + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200424125448.63318-8-kwolf@redhat.com> +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 955c7d6687fefcd903900a1e597fcbc896c661cd) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/io.c | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/block/io.c b/block/io.c +index 3235ce5..6c70b56 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -3370,6 +3370,31 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + goto out; + } + ++ /* ++ * If the image has a backing file that is large enough that it would ++ * provide data for the new area, we cannot leave it unallocated because ++ * then the backing file content would become visible. Instead, zero-fill ++ * the new area. ++ * ++ * Note that if the image has a backing file, but was opened without the ++ * backing file, taking care of keeping things consistent with that backing ++ * file is the user's responsibility. 
++ */ ++ if (new_bytes && bs->backing) { ++ int64_t backing_len; ++ ++ backing_len = bdrv_getlength(backing_bs(bs)); ++ if (backing_len < 0) { ++ ret = backing_len; ++ error_setg_errno(errp, -ret, "Could not get backing file size"); ++ goto out; ++ } ++ ++ if (backing_len > old_size) { ++ flags |= BDRV_REQ_ZERO_WRITE; ++ } ++ } ++ + if (drv->bdrv_co_truncate) { + if (flags & ~bs->supported_truncate_flags) { + error_setg(errp, "Block driver does not support requested flags"); +-- +1.8.3.1 + diff --git a/kvm-block.c-adding-bdrv_co_delete_file.patch b/kvm-block.c-adding-bdrv_co_delete_file.patch new file mode 100755 index 0000000..91c3cd1 --- /dev/null +++ b/kvm-block.c-adding-bdrv_co_delete_file.patch @@ -0,0 +1,92 @@ +From 23b92512d7f11b3a38cf24a5c2fe7848f1e550e8 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Sun, 31 May 2020 16:40:34 +0100 +Subject: [PATCH 6/7] block.c: adding bdrv_co_delete_file +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Maxim Levitsky +Message-id: <20200531164035.34188-3-mlevitsk@redhat.com> +Patchwork-id: 97058 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/3] block.c: adding bdrv_co_delete_file +Bugzilla: 1827630 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: John Snow +RH-Acked-by: Eric Blake + +From: Daniel Henrique Barboza + +Using the new 'bdrv_co_delete_file' interface, a pure co_routine function +'bdrv_co_delete_file' inside block.c can be used in a way similar to +the existing bdrv_create_file to clean up a created file. + +We're creating a pure co_routine because the only caller of +'bdrv_co_delete_file' will be already in co_routine context, thus there +is no need to add all the machinery to check for qemu_in_coroutine() and +create a separated co_routine to do the job. + +Suggested-by: Daniel P. 
Berrangé +Signed-off-by: Daniel Henrique Barboza +Message-Id: <20200130213907.2830642-3-danielhb413@gmail.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit e1d7f8bb1ec0c6911dcea81641ce6139dbded02d) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 26 ++++++++++++++++++++++++++ + include/block/block.h | 1 + + 2 files changed, 27 insertions(+) + +diff --git a/block.c b/block.c +index ba3b40d7..d6a05da 100644 +--- a/block.c ++++ b/block.c +@@ -672,6 +672,32 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) + return bdrv_create(drv, filename, opts, errp); + } + ++int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp) ++{ ++ Error *local_err = NULL; ++ int ret; ++ ++ assert(bs != NULL); ++ ++ if (!bs->drv) { ++ error_setg(errp, "Block node '%s' is not opened", bs->filename); ++ return -ENOMEDIUM; ++ } ++ ++ if (!bs->drv->bdrv_co_delete_file) { ++ error_setg(errp, "Driver '%s' does not support image deletion", ++ bs->drv->format_name); ++ return -ENOTSUP; ++ } ++ ++ ret = bs->drv->bdrv_co_delete_file(bs, &local_err); ++ if (ret < 0) { ++ error_propagate(errp, local_err); ++ } ++ ++ return ret; ++} ++ + /** + * Try to get @bs's logical and physical block size. + * On success, store them in @bsz struct and return 0. 
+diff --git a/include/block/block.h b/include/block/block.h +index 92685d2..b2a3074 100644 +--- a/include/block/block.h ++++ b/include/block/block.h +@@ -373,6 +373,7 @@ bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, + int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base, + Error **errp); + void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base); ++int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp); + + + typedef struct BdrvCheckResult { +-- +1.8.3.1 + diff --git a/kvm-blockdev-Acquire-AioContext-on-dirty-bitmap-function.patch b/kvm-blockdev-Acquire-AioContext-on-dirty-bitmap-function.patch new file mode 100755 index 0000000..9a69130 --- /dev/null +++ b/kvm-blockdev-Acquire-AioContext-on-dirty-bitmap-function.patch @@ -0,0 +1,176 @@ +From dc2654f2319ad6c379e0ba10be143726c6f0e9e0 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:47 +0000 +Subject: [PATCH 14/18] blockdev: Acquire AioContext on dirty bitmap functions + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-8-slp@redhat.com> +Patchwork-id: 93760 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 7/9] blockdev: Acquire AioContext on dirty bitmap functions +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Dirty bitmap addition and removal functions are not acquiring the BDS +AioContext, while they may call code that expects it to be +acquired. 
+ +This may trigger a crash with a stack trace like this one: + + #0 0x00007f0ef146370f in __GI_raise (sig=sig@entry=6) + at ../sysdeps/unix/sysv/linux/raise.c:50 + #1 0x00007f0ef144db25 in __GI_abort () at abort.c:79 + #2 0x0000565022294dce in error_exit + (err=, msg=msg@entry=0x56502243a730 <__func__.16350> "qemu_mutex_unlock_impl") at util/qemu-thread-posix.c:36 + #3 0x00005650222950ba in qemu_mutex_unlock_impl + (mutex=mutex@entry=0x5650244b0240, file=file@entry=0x565022439adf "util/async.c", line=line@entry=526) at util/qemu-thread-posix.c:108 + #4 0x0000565022290029 in aio_context_release + (ctx=ctx@entry=0x5650244b01e0) at util/async.c:526 + #5 0x000056502221cd08 in bdrv_can_store_new_dirty_bitmap + (bs=bs@entry=0x5650244dc820, name=name@entry=0x56502481d360 "bitmap1", granularity=granularity@entry=65536, errp=errp@entry=0x7fff22831718) + at block/dirty-bitmap.c:542 + #6 0x000056502206ae53 in qmp_block_dirty_bitmap_add + (errp=0x7fff22831718, disabled=false, has_disabled=, persistent=, has_persistent=true, granularity=65536, has_granularity=, name=0x56502481d360 "bitmap1", node=) at blockdev.c:2894 + #7 0x000056502206ae53 in qmp_block_dirty_bitmap_add + (node=, name=0x56502481d360 "bitmap1", has_granularity=, granularity=, has_persistent=true, persistent=, has_disabled=false, disabled=false, errp=0x7fff22831718) at blockdev.c:2856 + #8 0x00005650221847a3 in qmp_marshal_block_dirty_bitmap_add + (args=, ret=, errp=0x7fff22831798) + at qapi/qapi-commands-block-core.c:651 + #9 0x0000565022247e6c in do_qmp_dispatch + (errp=0x7fff22831790, allow_oob=, request=, cmds=0x565022b32d60 ) at qapi/qmp-dispatch.c:132 + #10 0x0000565022247e6c in qmp_dispatch + (cmds=0x565022b32d60 , request=, allow_oob=) at qapi/qmp-dispatch.c:175 + #11 0x0000565022166061 in monitor_qmp_dispatch + (mon=0x56502450faa0, req=) at monitor/qmp.c:145 + #12 0x00005650221666fa in monitor_qmp_bh_dispatcher + (data=) at monitor/qmp.c:234 + #13 0x000056502228f866 in aio_bh_call (bh=0x56502440eae0) + 
at util/async.c:117 + #14 0x000056502228f866 in aio_bh_poll (ctx=ctx@entry=0x56502440d7a0) + at util/async.c:117 + #15 0x0000565022292c54 in aio_dispatch (ctx=0x56502440d7a0) + at util/aio-posix.c:459 + #16 0x000056502228f742 in aio_ctx_dispatch + (source=, callback=, user_data=) at util/async.c:260 + #17 0x00007f0ef5ce667d in g_main_dispatch (context=0x56502449aa40) + at gmain.c:3176 + #18 0x00007f0ef5ce667d in g_main_context_dispatch + (context=context@entry=0x56502449aa40) at gmain.c:3829 + #19 0x0000565022291d08 in glib_pollfds_poll () at util/main-loop.c:219 + #20 0x0000565022291d08 in os_host_main_loop_wait + (timeout=) at util/main-loop.c:242 + #21 0x0000565022291d08 in main_loop_wait (nonblocking=) + at util/main-loop.c:518 + #22 0x00005650220743c1 in main_loop () at vl.c:1828 + #23 0x0000565021f20a72 in main + (argc=, argv=, envp=) + at vl.c:4504 + +Fix this by acquiring the AioContext at qmp_block_dirty_bitmap_add() +and qmp_block_dirty_bitmap_remove(). + +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1782175 +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 91005a495e228ebd7e5e173cd18f952450eef82d) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. 
de Paula +--- + blockdev.c | 22 ++++++++++++++++++---- + 1 file changed, 18 insertions(+), 4 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 1dacbc2..d4ef6cd 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -2984,6 +2984,7 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, + { + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; ++ AioContext *aio_context; + + if (!name || name[0] == '\0') { + error_setg(errp, "Bitmap name cannot be empty"); +@@ -2995,11 +2996,14 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, + return; + } + ++ aio_context = bdrv_get_aio_context(bs); ++ aio_context_acquire(aio_context); ++ + if (has_granularity) { + if (granularity < 512 || !is_power_of_2(granularity)) { + error_setg(errp, "Granularity must be power of 2 " + "and at least 512"); +- return; ++ goto out; + } + } else { + /* Default to cluster size, if available: */ +@@ -3017,12 +3021,12 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, + if (persistent && + !bdrv_can_store_new_dirty_bitmap(bs, name, granularity, errp)) + { +- return; ++ goto out; + } + + bitmap = bdrv_create_dirty_bitmap(bs, granularity, name, errp); + if (bitmap == NULL) { +- return; ++ goto out; + } + + if (disabled) { +@@ -3030,6 +3034,9 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, + } + + bdrv_dirty_bitmap_set_persistence(bitmap, persistent); ++ ++out: ++ aio_context_release(aio_context); + } + + static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( +@@ -3038,21 +3045,27 @@ static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( + { + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; ++ AioContext *aio_context; + + bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); + if (!bitmap || !bs) { + return NULL; + } + ++ aio_context = bdrv_get_aio_context(bs); ++ aio_context_acquire(aio_context); ++ + if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_BUSY | BDRV_BITMAP_RO, + errp)) { ++ aio_context_release(aio_context); + 
return NULL; + } + + if (bdrv_dirty_bitmap_get_persistence(bitmap) && + bdrv_remove_persistent_dirty_bitmap(bs, name, errp) < 0) + { +- return NULL; ++ aio_context_release(aio_context); ++ return NULL; + } + + if (release) { +@@ -3063,6 +3076,7 @@ static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( + *bitmap_bs = bs; + } + ++ aio_context_release(aio_context); + return release ? NULL : bitmap; + } + +-- +1.8.3.1 + diff --git a/kvm-blockdev-Promote-several-bitmap-functions-to-non-sta.patch b/kvm-blockdev-Promote-several-bitmap-functions-to-non-sta.patch new file mode 100755 index 0000000..8cb1700 --- /dev/null +++ b/kvm-blockdev-Promote-several-bitmap-functions-to-non-sta.patch @@ -0,0 +1,179 @@ +From 0c8ba0a96a7d0cbf371f1a5fbee543e8b2cb2595 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:13 +0100 +Subject: [PATCH 08/26] blockdev: Promote several bitmap functions to + non-static +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-6-eblake@redhat.com> +Patchwork-id: 97077 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 05/12] blockdev: Promote several bitmap functions to non-static +Bugzilla: 1779893 1779904 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +The next patch will split blockdev.c, which will require accessing +some previously-static functions from more than one .c file. But part +of promoting a function to public is picking a naming scheme that does +not reek of exposing too many internals (two of the three functions +were named starting with 'do_'). To make future code motion easier, +perform the function rename and non-static promotion into its own +patch. 
+ +Signed-off-by: Eric Blake +Reviewed-by: Max Reitz +Message-Id: <20200513011648.166876-5-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit c6996cf9a6c759c29919642be9a73ac64b38301b) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 47 +++++++++++++++++++---------------------------- + include/block/block_int.h | 12 ++++++++++++ + 2 files changed, 31 insertions(+), 28 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 86eb115..3958058 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1262,10 +1262,10 @@ out_aio_context: + * + * @return: A bitmap object on success, or NULL on failure. + */ +-static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, +- const char *name, +- BlockDriverState **pbs, +- Error **errp) ++BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, ++ const char *name, ++ BlockDriverState **pbs, ++ Error **errp) + { + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; +@@ -2241,11 +2241,6 @@ static void block_dirty_bitmap_disable_abort(BlkActionState *common) + } + } + +-static BdrvDirtyBitmap *do_block_dirty_bitmap_merge( +- const char *node, const char *target, +- BlockDirtyBitmapMergeSourceList *bitmaps, +- HBitmap **backup, Error **errp); +- + static void block_dirty_bitmap_merge_prepare(BlkActionState *common, + Error **errp) + { +@@ -2259,15 +2254,11 @@ static void block_dirty_bitmap_merge_prepare(BlkActionState *common, + + action = common->action->u.block_dirty_bitmap_merge.data; + +- state->bitmap = do_block_dirty_bitmap_merge(action->node, action->target, +- action->bitmaps, &state->backup, +- errp); ++ state->bitmap = block_dirty_bitmap_merge(action->node, action->target, ++ action->bitmaps, &state->backup, ++ errp); + } + +-static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( +- const char *node, const char *name, bool release, +- BlockDriverState **bitmap_bs, Error **errp); +- + static void block_dirty_bitmap_remove_prepare(BlkActionState 
*common, + Error **errp) + { +@@ -2281,8 +2272,8 @@ static void block_dirty_bitmap_remove_prepare(BlkActionState *common, + + action = common->action->u.block_dirty_bitmap_remove.data; + +- state->bitmap = do_block_dirty_bitmap_remove(action->node, action->name, +- false, &state->bs, errp); ++ state->bitmap = block_dirty_bitmap_remove(action->node, action->name, ++ false, &state->bs, errp); + if (state->bitmap) { + bdrv_dirty_bitmap_skip_store(state->bitmap, true); + bdrv_dirty_bitmap_set_busy(state->bitmap, true); +@@ -3046,9 +3037,10 @@ out: + aio_context_release(aio_context); + } + +-static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( +- const char *node, const char *name, bool release, +- BlockDriverState **bitmap_bs, Error **errp) ++BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, ++ bool release, ++ BlockDriverState **bitmap_bs, ++ Error **errp) + { + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; +@@ -3090,7 +3082,7 @@ static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( + void qmp_block_dirty_bitmap_remove(const char *node, const char *name, + Error **errp) + { +- do_block_dirty_bitmap_remove(node, name, true, NULL, errp); ++ block_dirty_bitmap_remove(node, name, true, NULL, errp); + } + + /** +@@ -3151,10 +3143,9 @@ void qmp_block_dirty_bitmap_disable(const char *node, const char *name, + bdrv_disable_dirty_bitmap(bitmap); + } + +-static BdrvDirtyBitmap *do_block_dirty_bitmap_merge( +- const char *node, const char *target, +- BlockDirtyBitmapMergeSourceList *bitmaps, +- HBitmap **backup, Error **errp) ++BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, ++ BlockDirtyBitmapMergeSourceList *bms, ++ HBitmap **backup, Error **errp) + { + BlockDriverState *bs; + BdrvDirtyBitmap *dst, *src, *anon; +@@ -3172,7 +3163,7 @@ static BdrvDirtyBitmap *do_block_dirty_bitmap_merge( + return NULL; + } + +- for (lst = bitmaps; lst; lst = lst->next) { ++ for (lst = bms; lst; lst = lst->next) { + switch 
(lst->value->type) { + const char *name, *node; + case QTYPE_QSTRING: +@@ -3217,7 +3208,7 @@ void qmp_block_dirty_bitmap_merge(const char *node, const char *target, + BlockDirtyBitmapMergeSourceList *bitmaps, + Error **errp) + { +- do_block_dirty_bitmap_merge(node, target, bitmaps, NULL, errp); ++ block_dirty_bitmap_merge(node, target, bitmaps, NULL, errp); + } + + BlockDirtyBitmapSha256 *qmp_x_debug_block_dirty_bitmap_sha256(const char *node, +diff --git a/include/block/block_int.h b/include/block/block_int.h +index cc18e8d..876a83d 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -1341,4 +1341,16 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, + Error **errp); + extern QemuOptsList bdrv_create_opts_simple; + ++BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, ++ const char *name, ++ BlockDriverState **pbs, ++ Error **errp); ++BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, ++ BlockDirtyBitmapMergeSourceList *bms, ++ HBitmap **backup, Error **errp); ++BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, ++ bool release, ++ BlockDriverState **bitmap_bs, ++ Error **errp); ++ + #endif /* BLOCK_INT_H */ +-- +1.8.3.1 + diff --git a/kvm-blockdev-Return-bs-to-the-proper-context-on-snapshot.patch b/kvm-blockdev-Return-bs-to-the-proper-context-on-snapshot.patch new file mode 100755 index 0000000..b2dd453 --- /dev/null +++ b/kvm-blockdev-Return-bs-to-the-proper-context-on-snapshot.patch @@ -0,0 +1,107 @@ +From 24e5eca4218b294bd013e2d85a38345045506bec Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:48 +0000 +Subject: [PATCH 15/18] blockdev: Return bs to the proper context on snapshot + abort + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-9-slp@redhat.com> +Patchwork-id: 93761 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 8/9] blockdev: Return bs to the proper context on snapshot abort +Bugzilla: 1745606 
1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +external_snapshot_abort() calls to bdrv_set_backing_hd(), which +returns state->old_bs to the main AioContext, as it's intended to be +used when the BDS is going to be released. As that's not the case when +aborting an external snapshot, return it to the AioContext it was +before the call. + +This issue can be triggered by issuing a transaction with two actions, +a proper blockdev-snapshot-sync and a bogus one, so the second will +trigger a transaction abort. This results in a crash with a stack +trace like this one: + + #0 0x00007fa1048b28df in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50 + #1 0x00007fa10489ccf5 in __GI_abort () at abort.c:79 + #2 0x00007fa10489cbc9 in __assert_fail_base + (fmt=0x7fa104a03300 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x5572240b44d8 "bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs)", file=0x557224014d30 "block.c", line=2240, function=) at assert.c:92 + #3 0x00007fa1048aae96 in __GI___assert_fail + (assertion=assertion@entry=0x5572240b44d8 "bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs)", file=file@entry=0x557224014d30 "block.c", line=line@entry=2240, function=function@entry=0x5572240b5d60 <__PRETTY_FUNCTION__.31620> "bdrv_replace_child_noperm") at assert.c:101 + #4 0x0000557223e631f8 in bdrv_replace_child_noperm (child=0x557225b9c980, new_bs=new_bs@entry=0x557225c42e40) at block.c:2240 + #5 0x0000557223e68be7 in bdrv_replace_node (from=0x557226951a60, to=0x557225c42e40, errp=0x5572247d6138 ) at block.c:4196 + #6 0x0000557223d069c4 in external_snapshot_abort (common=0x557225d7e170) at blockdev.c:1731 + #7 0x0000557223d069c4 in external_snapshot_abort (common=0x557225d7e170) at blockdev.c:1717 + #8 0x0000557223d09013 in qmp_transaction (dev_list=, has_props=, props=0x557225cc7d70, 
errp=errp@entry=0x7ffe704c0c98) at blockdev.c:2360 + #9 0x0000557223e32085 in qmp_marshal_transaction (args=, ret=, errp=0x7ffe704c0d08) at qapi/qapi-commands-transaction.c:44 + #10 0x0000557223ee798c in do_qmp_dispatch (errp=0x7ffe704c0d00, allow_oob=, request=, cmds=0x5572247d3cc0 ) at qapi/qmp-dispatch.c:132 + #11 0x0000557223ee798c in qmp_dispatch (cmds=0x5572247d3cc0 , request=, allow_oob=) at qapi/qmp-dispatch.c:175 + #12 0x0000557223e06141 in monitor_qmp_dispatch (mon=0x557225c69ff0, req=) at monitor/qmp.c:120 + #13 0x0000557223e0678a in monitor_qmp_bh_dispatcher (data=) at monitor/qmp.c:209 + #14 0x0000557223f2f366 in aio_bh_call (bh=0x557225b9dc60) at util/async.c:117 + #15 0x0000557223f2f366 in aio_bh_poll (ctx=ctx@entry=0x557225b9c840) at util/async.c:117 + #16 0x0000557223f32754 in aio_dispatch (ctx=0x557225b9c840) at util/aio-posix.c:459 + #17 0x0000557223f2f242 in aio_ctx_dispatch (source=, callback=, user_data=) at util/async.c:260 + #18 0x00007fa10913467d in g_main_dispatch (context=0x557225c28e80) at gmain.c:3176 + #19 0x00007fa10913467d in g_main_context_dispatch (context=context@entry=0x557225c28e80) at gmain.c:3829 + #20 0x0000557223f31808 in glib_pollfds_poll () at util/main-loop.c:219 + #21 0x0000557223f31808 in os_host_main_loop_wait (timeout=) at util/main-loop.c:242 + #22 0x0000557223f31808 in main_loop_wait (nonblocking=) at util/main-loop.c:518 + #23 0x0000557223d13201 in main_loop () at vl.c:1828 + #24 0x0000557223bbfb82 in main (argc=, argv=, envp=) at vl.c:4504 + +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1779036 +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 377410f6fb4f6b0d26d4a028c20766fae05de17e) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. 
de Paula +--- + blockdev.c | 21 +++++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/blockdev.c b/blockdev.c +index d4ef6cd..4cd9a58 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1731,6 +1731,8 @@ static void external_snapshot_abort(BlkActionState *common) + if (state->new_bs) { + if (state->overlay_appended) { + AioContext *aio_context; ++ AioContext *tmp_context; ++ int ret; + + aio_context = bdrv_get_aio_context(state->old_bs); + aio_context_acquire(aio_context); +@@ -1738,6 +1740,25 @@ static void external_snapshot_abort(BlkActionState *common) + bdrv_ref(state->old_bs); /* we can't let bdrv_set_backind_hd() + close state->old_bs; we need it */ + bdrv_set_backing_hd(state->new_bs, NULL, &error_abort); ++ ++ /* ++ * The call to bdrv_set_backing_hd() above returns state->old_bs to ++ * the main AioContext. As we're still going to be using it, return ++ * it to the AioContext it was before. ++ */ ++ tmp_context = bdrv_get_aio_context(state->old_bs); ++ if (aio_context != tmp_context) { ++ aio_context_release(aio_context); ++ aio_context_acquire(tmp_context); ++ ++ ret = bdrv_try_set_aio_context(state->old_bs, ++ aio_context, NULL); ++ assert(ret == 0); ++ ++ aio_context_release(tmp_context); ++ aio_context_acquire(aio_context); ++ } ++ + bdrv_replace_node(state->new_bs, state->old_bs, &error_abort); + bdrv_unref(state->old_bs); /* bdrv_replace_node() ref'ed old_bs */ + +-- +1.8.3.1 + diff --git a/kvm-blockdev-Split-off-basic-bitmap-operations-for-qemu-.patch b/kvm-blockdev-Split-off-basic-bitmap-operations-for-qemu-.patch new file mode 100755 index 0000000..d977922 --- /dev/null +++ b/kvm-blockdev-Split-off-basic-bitmap-operations-for-qemu-.patch @@ -0,0 +1,720 @@ +From 2afa718d59ef86879a9e34b4601a1f2658afa9ba Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:14 +0100 +Subject: [PATCH 09/26] blockdev: Split off basic bitmap operations for + qemu-img + +RH-Author: Eric Blake +Message-id: 
<20200602023420.2133649-7-eblake@redhat.com> +Patchwork-id: 97073 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 06/12] blockdev: Split off basic bitmap operations for qemu-img +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Upcoming patches want to add some basic bitmap manipulation abilities +to qemu-img. But blockdev.o is too heavyweight to link into qemu-img +(among other things, it would drag in block jobs and transaction +support - qemu-img does offline manipulation, where atomicity is less +important because there are no concurrent modifications to compete +with), so it's time to split off the bare bones of what we will need +into a new file block/monitor/bitmap-qmp-cmds.o. + +This is sufficient to expose 6 QMP commands for use by qemu-img (add, +remove, clear, enable, disable, merge), as well as move the three +helper functions touched in the previous patch. Regarding +MAINTAINERS, the new file is automatically part of block core, but +also makes sense as related to other dirty bitmap files. + +Signed-off-by: Eric Blake +Reviewed-by: Max Reitz +Message-Id: <20200513011648.166876-6-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit bb4e58c6137e80129b955789dd4b66c1504f20dc) + +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + Makefile.objs - comment context + block/monitor/Makefile.objs - context: a2dde2f2 not backported + blockdev.c - context +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. 
de Paula +--- + MAINTAINERS | 1 + + Makefile.objs | 3 +- + block/monitor/Makefile.objs | 1 + + block/monitor/bitmap-qmp-cmds.c | 321 ++++++++++++++++++++++++++++++++++++++++ + blockdev.c | 284 ----------------------------------- + 5 files changed, 324 insertions(+), 286 deletions(-) + create mode 100644 block/monitor/Makefile.objs + create mode 100644 block/monitor/bitmap-qmp-cmds.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index 3a81ac9..49d5d44 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -1875,6 +1875,7 @@ L: qemu-block@nongnu.org + S: Supported + F: include/qemu/hbitmap.h + F: include/block/dirty-bitmap.h ++F: block/monitor/bitmap-qmp-cmds.c + F: block/dirty-bitmap.c + F: block/qcow2-bitmap.c + F: migration/block-dirty-bitmap.c +diff --git a/Makefile.objs b/Makefile.objs +index 1a8f288..7404ef0 100644 +--- a/Makefile.objs ++++ b/Makefile.objs +@@ -13,9 +13,8 @@ authz-obj-y = authz/ + ####################################################################### + # block-obj-y is code used by both qemu system emulation and qemu-img + +-block-obj-y = nbd/ ++block-obj-y = block/ block/monitor/ nbd/ scsi/ + block-obj-y += block.o blockjob.o job.o +-block-obj-y += block/ scsi/ + block-obj-y += qemu-io-cmds.o + block-obj-$(CONFIG_REPLICATION) += replication.o + +diff --git a/block/monitor/Makefile.objs b/block/monitor/Makefile.objs +new file mode 100644 +index 0000000..f0c7642 +--- /dev/null ++++ b/block/monitor/Makefile.objs +@@ -0,0 +1 @@ ++block-obj-y += bitmap-qmp-cmds.o +diff --git a/block/monitor/bitmap-qmp-cmds.c b/block/monitor/bitmap-qmp-cmds.c +new file mode 100644 +index 0000000..9f11dee +--- /dev/null ++++ b/block/monitor/bitmap-qmp-cmds.c +@@ -0,0 +1,321 @@ ++/* ++ * QEMU block dirty bitmap QMP commands ++ * ++ * Copyright (c) 2003-2008 Fabrice Bellard ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or ++ * later. See the COPYING file in the top-level directory. 
++ * ++ * This file incorporates work covered by the following copyright and ++ * permission notice: ++ * ++ * Copyright (c) 2003-2008 Fabrice Bellard ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to deal ++ * in the Software without restriction, including without limitation the rights ++ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++ * copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++ * THE SOFTWARE. ++ */ ++ ++#include "qemu/osdep.h" ++ ++#include "block/block_int.h" ++#include "qapi/qapi-commands-block.h" ++#include "qapi/error.h" ++ ++/** ++ * block_dirty_bitmap_lookup: ++ * Return a dirty bitmap (if present), after validating ++ * the node reference and bitmap names. ++ * ++ * @node: The name of the BDS node to search for bitmaps ++ * @name: The name of the bitmap to search for ++ * @pbs: Output pointer for BDS lookup, if desired. Can be NULL. ++ * @errp: Output pointer for error information. Can be NULL. ++ * ++ * @return: A bitmap object on success, or NULL on failure. 
++ */ ++BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, ++ const char *name, ++ BlockDriverState **pbs, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ ++ if (!node) { ++ error_setg(errp, "Node cannot be NULL"); ++ return NULL; ++ } ++ if (!name) { ++ error_setg(errp, "Bitmap name cannot be NULL"); ++ return NULL; ++ } ++ bs = bdrv_lookup_bs(node, node, NULL); ++ if (!bs) { ++ error_setg(errp, "Node '%s' not found", node); ++ return NULL; ++ } ++ ++ bitmap = bdrv_find_dirty_bitmap(bs, name); ++ if (!bitmap) { ++ error_setg(errp, "Dirty bitmap '%s' not found", name); ++ return NULL; ++ } ++ ++ if (pbs) { ++ *pbs = bs; ++ } ++ ++ return bitmap; ++} ++ ++void qmp_block_dirty_bitmap_add(const char *node, const char *name, ++ bool has_granularity, uint32_t granularity, ++ bool has_persistent, bool persistent, ++ bool has_disabled, bool disabled, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ AioContext *aio_context; ++ ++ if (!name || name[0] == '\0') { ++ error_setg(errp, "Bitmap name cannot be empty"); ++ return; ++ } ++ ++ bs = bdrv_lookup_bs(node, node, errp); ++ if (!bs) { ++ return; ++ } ++ ++ aio_context = bdrv_get_aio_context(bs); ++ aio_context_acquire(aio_context); ++ ++ if (has_granularity) { ++ if (granularity < 512 || !is_power_of_2(granularity)) { ++ error_setg(errp, "Granularity must be power of 2 " ++ "and at least 512"); ++ goto out; ++ } ++ } else { ++ /* Default to cluster size, if available: */ ++ granularity = bdrv_get_default_bitmap_granularity(bs); ++ } ++ ++ if (!has_persistent) { ++ persistent = false; ++ } ++ ++ if (!has_disabled) { ++ disabled = false; ++ } ++ ++ if (persistent && ++ !bdrv_can_store_new_dirty_bitmap(bs, name, granularity, errp)) ++ { ++ goto out; ++ } ++ ++ bitmap = bdrv_create_dirty_bitmap(bs, granularity, name, errp); ++ if (bitmap == NULL) { ++ goto out; ++ } ++ ++ if (disabled) { ++ bdrv_disable_dirty_bitmap(bitmap); ++ } ++ ++ 
bdrv_dirty_bitmap_set_persistence(bitmap, persistent); ++ ++out: ++ aio_context_release(aio_context); ++} ++ ++BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, ++ bool release, ++ BlockDriverState **bitmap_bs, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ AioContext *aio_context; ++ ++ bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); ++ if (!bitmap || !bs) { ++ return NULL; ++ } ++ ++ aio_context = bdrv_get_aio_context(bs); ++ aio_context_acquire(aio_context); ++ ++ if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_BUSY | BDRV_BITMAP_RO, ++ errp)) { ++ aio_context_release(aio_context); ++ return NULL; ++ } ++ ++ if (bdrv_dirty_bitmap_get_persistence(bitmap) && ++ bdrv_remove_persistent_dirty_bitmap(bs, name, errp) < 0) ++ { ++ aio_context_release(aio_context); ++ return NULL; ++ } ++ ++ if (release) { ++ bdrv_release_dirty_bitmap(bitmap); ++ } ++ ++ if (bitmap_bs) { ++ *bitmap_bs = bs; ++ } ++ ++ aio_context_release(aio_context); ++ return release ? NULL : bitmap; ++} ++ ++void qmp_block_dirty_bitmap_remove(const char *node, const char *name, ++ Error **errp) ++{ ++ block_dirty_bitmap_remove(node, name, true, NULL, errp); ++} ++ ++/** ++ * Completely clear a bitmap, for the purposes of synchronizing a bitmap ++ * immediately after a full backup operation. 
++ */ ++void qmp_block_dirty_bitmap_clear(const char *node, const char *name, ++ Error **errp) ++{ ++ BdrvDirtyBitmap *bitmap; ++ BlockDriverState *bs; ++ ++ bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); ++ if (!bitmap || !bs) { ++ return; ++ } ++ ++ if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_DEFAULT, errp)) { ++ return; ++ } ++ ++ bdrv_clear_dirty_bitmap(bitmap, NULL); ++} ++ ++void qmp_block_dirty_bitmap_enable(const char *node, const char *name, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ ++ bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); ++ if (!bitmap) { ++ return; ++ } ++ ++ if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) { ++ return; ++ } ++ ++ bdrv_enable_dirty_bitmap(bitmap); ++} ++ ++void qmp_block_dirty_bitmap_disable(const char *node, const char *name, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ ++ bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); ++ if (!bitmap) { ++ return; ++ } ++ ++ if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) { ++ return; ++ } ++ ++ bdrv_disable_dirty_bitmap(bitmap); ++} ++ ++BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, ++ BlockDirtyBitmapMergeSourceList *bms, ++ HBitmap **backup, Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *dst, *src, *anon; ++ BlockDirtyBitmapMergeSourceList *lst; ++ Error *local_err = NULL; ++ ++ dst = block_dirty_bitmap_lookup(node, target, &bs, errp); ++ if (!dst) { ++ return NULL; ++ } ++ ++ anon = bdrv_create_dirty_bitmap(bs, bdrv_dirty_bitmap_granularity(dst), ++ NULL, errp); ++ if (!anon) { ++ return NULL; ++ } ++ ++ for (lst = bms; lst; lst = lst->next) { ++ switch (lst->value->type) { ++ const char *name, *node; ++ case QTYPE_QSTRING: ++ name = lst->value->u.local; ++ src = bdrv_find_dirty_bitmap(bs, name); ++ if (!src) { ++ error_setg(errp, "Dirty bitmap '%s' not found", name); ++ dst = NULL; ++ goto out; ++ } ++ 
break; ++ case QTYPE_QDICT: ++ node = lst->value->u.external.node; ++ name = lst->value->u.external.name; ++ src = block_dirty_bitmap_lookup(node, name, NULL, errp); ++ if (!src) { ++ dst = NULL; ++ goto out; ++ } ++ break; ++ default: ++ abort(); ++ } ++ ++ bdrv_merge_dirty_bitmap(anon, src, NULL, &local_err); ++ if (local_err) { ++ error_propagate(errp, local_err); ++ dst = NULL; ++ goto out; ++ } ++ } ++ ++ /* Merge into dst; dst is unchanged on failure. */ ++ bdrv_merge_dirty_bitmap(dst, anon, backup, errp); ++ ++ out: ++ bdrv_release_dirty_bitmap(anon); ++ return dst; ++} ++ ++void qmp_block_dirty_bitmap_merge(const char *node, const char *target, ++ BlockDirtyBitmapMergeSourceList *bitmaps, ++ Error **errp) ++{ ++ block_dirty_bitmap_merge(node, target, bitmaps, NULL, errp); ++} +diff --git a/blockdev.c b/blockdev.c +index 3958058..5128c9b 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1250,53 +1250,6 @@ out_aio_context: + return NULL; + } + +-/** +- * block_dirty_bitmap_lookup: +- * Return a dirty bitmap (if present), after validating +- * the node reference and bitmap names. +- * +- * @node: The name of the BDS node to search for bitmaps +- * @name: The name of the bitmap to search for +- * @pbs: Output pointer for BDS lookup, if desired. Can be NULL. +- * @errp: Output pointer for error information. Can be NULL. +- * +- * @return: A bitmap object on success, or NULL on failure. 
+- */ +-BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, +- const char *name, +- BlockDriverState **pbs, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- +- if (!node) { +- error_setg(errp, "Node cannot be NULL"); +- return NULL; +- } +- if (!name) { +- error_setg(errp, "Bitmap name cannot be NULL"); +- return NULL; +- } +- bs = bdrv_lookup_bs(node, node, NULL); +- if (!bs) { +- error_setg(errp, "Node '%s' not found", node); +- return NULL; +- } +- +- bitmap = bdrv_find_dirty_bitmap(bs, name); +- if (!bitmap) { +- error_setg(errp, "Dirty bitmap '%s' not found", name); +- return NULL; +- } +- +- if (pbs) { +- *pbs = bs; +- } +- +- return bitmap; +-} +- + /* New and old BlockDriverState structs for atomic group operations */ + + typedef struct BlkActionState BlkActionState; +@@ -2974,243 +2927,6 @@ out: + aio_context_release(aio_context); + } + +-void qmp_block_dirty_bitmap_add(const char *node, const char *name, +- bool has_granularity, uint32_t granularity, +- bool has_persistent, bool persistent, +- bool has_disabled, bool disabled, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- AioContext *aio_context; +- +- if (!name || name[0] == '\0') { +- error_setg(errp, "Bitmap name cannot be empty"); +- return; +- } +- +- bs = bdrv_lookup_bs(node, node, errp); +- if (!bs) { +- return; +- } +- +- aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); +- +- if (has_granularity) { +- if (granularity < 512 || !is_power_of_2(granularity)) { +- error_setg(errp, "Granularity must be power of 2 " +- "and at least 512"); +- goto out; +- } +- } else { +- /* Default to cluster size, if available: */ +- granularity = bdrv_get_default_bitmap_granularity(bs); +- } +- +- if (!has_persistent) { +- persistent = false; +- } +- +- if (!has_disabled) { +- disabled = false; +- } +- +- if (persistent && +- !bdrv_can_store_new_dirty_bitmap(bs, name, granularity, errp)) +- { +- goto out; +- } +- +- bitmap 
= bdrv_create_dirty_bitmap(bs, granularity, name, errp); +- if (bitmap == NULL) { +- goto out; +- } +- +- if (disabled) { +- bdrv_disable_dirty_bitmap(bitmap); +- } +- +- bdrv_dirty_bitmap_set_persistence(bitmap, persistent); +- +-out: +- aio_context_release(aio_context); +-} +- +-BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, +- bool release, +- BlockDriverState **bitmap_bs, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- AioContext *aio_context; +- +- bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); +- if (!bitmap || !bs) { +- return NULL; +- } +- +- aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); +- +- if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_BUSY | BDRV_BITMAP_RO, +- errp)) { +- aio_context_release(aio_context); +- return NULL; +- } +- +- if (bdrv_dirty_bitmap_get_persistence(bitmap) && +- bdrv_remove_persistent_dirty_bitmap(bs, name, errp) < 0) +- { +- aio_context_release(aio_context); +- return NULL; +- } +- +- if (release) { +- bdrv_release_dirty_bitmap(bitmap); +- } +- +- if (bitmap_bs) { +- *bitmap_bs = bs; +- } +- +- aio_context_release(aio_context); +- return release ? NULL : bitmap; +-} +- +-void qmp_block_dirty_bitmap_remove(const char *node, const char *name, +- Error **errp) +-{ +- block_dirty_bitmap_remove(node, name, true, NULL, errp); +-} +- +-/** +- * Completely clear a bitmap, for the purposes of synchronizing a bitmap +- * immediately after a full backup operation. 
+- */ +-void qmp_block_dirty_bitmap_clear(const char *node, const char *name, +- Error **errp) +-{ +- BdrvDirtyBitmap *bitmap; +- BlockDriverState *bs; +- +- bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); +- if (!bitmap || !bs) { +- return; +- } +- +- if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_DEFAULT, errp)) { +- return; +- } +- +- bdrv_clear_dirty_bitmap(bitmap, NULL); +-} +- +-void qmp_block_dirty_bitmap_enable(const char *node, const char *name, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- +- bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); +- if (!bitmap) { +- return; +- } +- +- if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) { +- return; +- } +- +- bdrv_enable_dirty_bitmap(bitmap); +-} +- +-void qmp_block_dirty_bitmap_disable(const char *node, const char *name, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- +- bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); +- if (!bitmap) { +- return; +- } +- +- if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) { +- return; +- } +- +- bdrv_disable_dirty_bitmap(bitmap); +-} +- +-BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, +- BlockDirtyBitmapMergeSourceList *bms, +- HBitmap **backup, Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *dst, *src, *anon; +- BlockDirtyBitmapMergeSourceList *lst; +- Error *local_err = NULL; +- +- dst = block_dirty_bitmap_lookup(node, target, &bs, errp); +- if (!dst) { +- return NULL; +- } +- +- anon = bdrv_create_dirty_bitmap(bs, bdrv_dirty_bitmap_granularity(dst), +- NULL, errp); +- if (!anon) { +- return NULL; +- } +- +- for (lst = bms; lst; lst = lst->next) { +- switch (lst->value->type) { +- const char *name, *node; +- case QTYPE_QSTRING: +- name = lst->value->u.local; +- src = bdrv_find_dirty_bitmap(bs, name); +- if (!src) { +- error_setg(errp, "Dirty bitmap '%s' not found", name); +- dst = NULL; +- goto out; +- } +- 
break; +- case QTYPE_QDICT: +- node = lst->value->u.external.node; +- name = lst->value->u.external.name; +- src = block_dirty_bitmap_lookup(node, name, NULL, errp); +- if (!src) { +- dst = NULL; +- goto out; +- } +- break; +- default: +- abort(); +- } +- +- bdrv_merge_dirty_bitmap(anon, src, NULL, &local_err); +- if (local_err) { +- error_propagate(errp, local_err); +- dst = NULL; +- goto out; +- } +- } +- +- /* Merge into dst; dst is unchanged on failure. */ +- bdrv_merge_dirty_bitmap(dst, anon, backup, errp); +- +- out: +- bdrv_release_dirty_bitmap(anon); +- return dst; +-} +- +-void qmp_block_dirty_bitmap_merge(const char *node, const char *target, +- BlockDirtyBitmapMergeSourceList *bitmaps, +- Error **errp) +-{ +- block_dirty_bitmap_merge(node, target, bitmaps, NULL, errp); +-} +- + BlockDirtyBitmapSha256 *qmp_x_debug_block_dirty_bitmap_sha256(const char *node, + const char *name, + Error **errp) +-- +1.8.3.1 + diff --git a/kvm-blockdev-fix-coding-style-issues-in-drive_backup_pre.patch b/kvm-blockdev-fix-coding-style-issues-in-drive_backup_pre.patch new file mode 100755 index 0000000..399a06a --- /dev/null +++ b/kvm-blockdev-fix-coding-style-issues-in-drive_backup_pre.patch @@ -0,0 +1,62 @@ +From d56b53cd75c4146eae7a06d1cc30ab823a9bde93 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:41 +0000 +Subject: [PATCH 08/18] blockdev: fix coding style issues in + drive_backup_prepare +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-2-slp@redhat.com> +Patchwork-id: 93754 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 1/9] blockdev: fix coding style issues in drive_backup_prepare +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Fix a couple of minor coding 
style issues in drive_backup_prepare. + +Signed-off-by: Sergio Lopez +Reviewed-by: Max Reitz +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 471ded690e19689018535e3f48480507ed073e22) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 8e029e9..553e315 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -3620,7 +3620,7 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn, + + if (!backup->has_format) { + backup->format = backup->mode == NEW_IMAGE_MODE_EXISTING ? +- NULL : (char*) bs->drv->format_name; ++ NULL : (char *) bs->drv->format_name; + } + + /* Early check to avoid creating target */ +@@ -3630,8 +3630,10 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn, + + flags = bs->open_flags | BDRV_O_RDWR; + +- /* See if we have a backing HD we can use to create our new image +- * on top of. */ ++ /* ++ * See if we have a backing HD we can use to create our new image ++ * on top of. 
++ */ + if (backup->sync == MIRROR_SYNC_MODE_TOP) { + source = backing_bs(bs); + if (!source) { +-- +1.8.3.1 + diff --git a/kvm-blockdev-honor-bdrv_try_set_aio_context-context-requ.patch b/kvm-blockdev-honor-bdrv_try_set_aio_context-context-requ.patch new file mode 100755 index 0000000..a94ee75 --- /dev/null +++ b/kvm-blockdev-honor-bdrv_try_set_aio_context-context-requ.patch @@ -0,0 +1,204 @@ +From da4ee4c0d56200042cb86f8ccd2777009bd82df3 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:44 +0000 +Subject: [PATCH 11/18] blockdev: honor bdrv_try_set_aio_context() context + requirements + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-5-slp@redhat.com> +Patchwork-id: 93758 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 4/9] blockdev: honor bdrv_try_set_aio_context() context requirements +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +bdrv_try_set_aio_context() requires that the old context is held, and +the new context is not held. Fix all the occurrences where it's not +done this way. + +Suggested-by: Max Reitz +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 3ea67e08832775a28d0bd2795f01bc77e7ea1512) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. 
de Paula +--- + blockdev.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 60 insertions(+), 8 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 152a0f7..1dacbc2 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1535,6 +1535,7 @@ static void external_snapshot_prepare(BlkActionState *common, + DO_UPCAST(ExternalSnapshotState, common, common); + TransactionAction *action = common->action; + AioContext *aio_context; ++ AioContext *old_context; + int ret; + + /* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar +@@ -1675,7 +1676,16 @@ static void external_snapshot_prepare(BlkActionState *common, + goto out; + } + ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ ++ old_context = bdrv_get_aio_context(state->new_bs); ++ aio_context_release(aio_context); ++ aio_context_acquire(old_context); ++ + ret = bdrv_try_set_aio_context(state->new_bs, aio_context, errp); ++ ++ aio_context_release(old_context); ++ aio_context_acquire(aio_context); ++ + if (ret < 0) { + goto out; + } +@@ -1775,11 +1785,13 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp) + BlockDriverState *target_bs; + BlockDriverState *source = NULL; + AioContext *aio_context; ++ AioContext *old_context; + QDict *options; + Error *local_err = NULL; + int flags; + int64_t size; + bool set_backing_hd = false; ++ int ret; + + assert(common->action->type == TRANSACTION_ACTION_KIND_DRIVE_BACKUP); + backup = common->action->u.drive_backup.data; +@@ -1868,6 +1880,21 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp) + goto out; + } + ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. 
*/ ++ old_context = bdrv_get_aio_context(target_bs); ++ aio_context_release(aio_context); ++ aio_context_acquire(old_context); ++ ++ ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); ++ if (ret < 0) { ++ bdrv_unref(target_bs); ++ aio_context_release(old_context); ++ return; ++ } ++ ++ aio_context_release(old_context); ++ aio_context_acquire(aio_context); ++ + if (set_backing_hd) { + bdrv_set_backing_hd(target_bs, source, &local_err); + if (local_err) { +@@ -1947,6 +1974,8 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + BlockDriverState *bs; + BlockDriverState *target_bs; + AioContext *aio_context; ++ AioContext *old_context; ++ int ret; + + assert(common->action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP); + backup = common->action->u.blockdev_backup.data; +@@ -1961,7 +1990,18 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + return; + } + ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. 
*/ + aio_context = bdrv_get_aio_context(bs); ++ old_context = bdrv_get_aio_context(target_bs); ++ aio_context_acquire(old_context); ++ ++ ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); ++ if (ret < 0) { ++ aio_context_release(old_context); ++ return; ++ } ++ ++ aio_context_release(old_context); + aio_context_acquire(aio_context); + state->bs = bs; + +@@ -3562,7 +3602,6 @@ static BlockJob *do_backup_common(BackupCommon *backup, + BlockJob *job = NULL; + BdrvDirtyBitmap *bmap = NULL; + int job_flags = JOB_DEFAULT; +- int ret; + + if (!backup->has_speed) { + backup->speed = 0; +@@ -3586,11 +3625,6 @@ static BlockJob *do_backup_common(BackupCommon *backup, + backup->compress = false; + } + +- ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); +- if (ret < 0) { +- return NULL; +- } +- + if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) || + (backup->sync == MIRROR_SYNC_MODE_INCREMENTAL)) { + /* done before desugaring 'incremental' to print the right message */ +@@ -3825,6 +3859,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + BlockDriverState *bs; + BlockDriverState *source, *target_bs; + AioContext *aio_context; ++ AioContext *old_context; + BlockMirrorBackingMode backing_mode; + Error *local_err = NULL; + QDict *options = NULL; +@@ -3937,12 +3972,22 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + (arg->mode == NEW_IMAGE_MODE_EXISTING || + !bdrv_has_zero_init(target_bs))); + ++ ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ ++ old_context = bdrv_get_aio_context(target_bs); ++ aio_context_release(aio_context); ++ aio_context_acquire(old_context); ++ + ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); + if (ret < 0) { + bdrv_unref(target_bs); +- goto out; ++ aio_context_release(old_context); ++ return; + } + ++ aio_context_release(old_context); ++ aio_context_acquire(aio_context); ++ + blockdev_mirror_common(arg->has_job_id ? 
arg->job_id : NULL, bs, target_bs, + arg->has_replaces, arg->replaces, arg->sync, + backing_mode, zero_target, +@@ -3984,6 +4029,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id, + BlockDriverState *bs; + BlockDriverState *target_bs; + AioContext *aio_context; ++ AioContext *old_context; + BlockMirrorBackingMode backing_mode = MIRROR_LEAVE_BACKING_CHAIN; + Error *local_err = NULL; + bool zero_target; +@@ -4001,10 +4047,16 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id, + + zero_target = (sync == MIRROR_SYNC_MODE_FULL); + ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ ++ old_context = bdrv_get_aio_context(target_bs); + aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); ++ aio_context_acquire(old_context); + + ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); ++ ++ aio_context_release(old_context); ++ aio_context_acquire(aio_context); ++ + if (ret < 0) { + goto out; + } +-- +1.8.3.1 + diff --git a/kvm-blockdev-unify-qmp_blockdev_backup-and-blockdev-back.patch b/kvm-blockdev-unify-qmp_blockdev_backup-and-blockdev-back.patch new file mode 100755 index 0000000..c426384 --- /dev/null +++ b/kvm-blockdev-unify-qmp_blockdev_backup-and-blockdev-back.patch @@ -0,0 +1,144 @@ +From 959955217f745f1ee6cbea97314efe69f2d7dc08 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:43 +0000 +Subject: [PATCH 10/18] blockdev: unify qmp_blockdev_backup and blockdev-backup + transaction paths + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-4-slp@redhat.com> +Patchwork-id: 93756 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 3/9] blockdev: unify qmp_blockdev_backup and blockdev-backup transaction paths +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Issuing a blockdev-backup from qmp_blockdev_backup takes a 
slightly +different path than when it's issued from a transaction. In the code, +this is manifested as some redundancy between do_blockdev_backup() and +blockdev_backup_prepare(). + +This change unifies both paths, merging do_blockdev_backup() and +blockdev_backup_prepare(), and changing qmp_blockdev_backup() to +create a transaction instead of calling do_backup_common() directly. + +As a side-effect, now qmp_blockdev_backup() is executed inside a +drained section, as it happens when creating a blockdev-backup +transaction. This change is visible from the user's perspective, as +the job gets paused and immediately resumed before starting the actual +work. + +Signed-off-by: Sergio Lopez +Reviewed-by: Max Reitz +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 5b7bfe515ecbd584b40ff6e41d2fd8b37c7d5139) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 60 +++++++++++++----------------------------------------------- + 1 file changed, 13 insertions(+), 47 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 5e85fc0..152a0f7 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1940,16 +1940,13 @@ typedef struct BlockdevBackupState { + BlockJob *job; + } BlockdevBackupState; + +-static BlockJob *do_blockdev_backup(BlockdevBackup *backup, JobTxn *txn, +- Error **errp); +- + static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + { + BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common); + BlockdevBackup *backup; +- BlockDriverState *bs, *target; ++ BlockDriverState *bs; ++ BlockDriverState *target_bs; + AioContext *aio_context; +- Error *local_err = NULL; + + assert(common->action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP); + backup = common->action->u.blockdev_backup.data; +@@ -1959,8 +1956,8 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + return; + } + +- target = bdrv_lookup_bs(backup->target, backup->target, errp); +- if 
(!target) { ++ target_bs = bdrv_lookup_bs(backup->target, backup->target, errp); ++ if (!target_bs) { + return; + } + +@@ -1971,13 +1968,10 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + /* Paired with .clean() */ + bdrv_drained_begin(state->bs); + +- state->job = do_blockdev_backup(backup, common->block_job_txn, &local_err); +- if (local_err) { +- error_propagate(errp, local_err); +- goto out; +- } ++ state->job = do_backup_common(qapi_BlockdevBackup_base(backup), ++ bs, target_bs, aio_context, ++ common->block_job_txn, errp); + +-out: + aio_context_release(aio_context); + } + +@@ -3695,41 +3689,13 @@ XDbgBlockGraph *qmp_x_debug_query_block_graph(Error **errp) + return bdrv_get_xdbg_block_graph(errp); + } + +-BlockJob *do_blockdev_backup(BlockdevBackup *backup, JobTxn *txn, +- Error **errp) ++void qmp_blockdev_backup(BlockdevBackup *backup, Error **errp) + { +- BlockDriverState *bs; +- BlockDriverState *target_bs; +- AioContext *aio_context; +- BlockJob *job; +- +- bs = bdrv_lookup_bs(backup->device, backup->device, errp); +- if (!bs) { +- return NULL; +- } +- +- target_bs = bdrv_lookup_bs(backup->target, backup->target, errp); +- if (!target_bs) { +- return NULL; +- } +- +- aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); +- +- job = do_backup_common(qapi_BlockdevBackup_base(backup), +- bs, target_bs, aio_context, txn, errp); +- +- aio_context_release(aio_context); +- return job; +-} +- +-void qmp_blockdev_backup(BlockdevBackup *arg, Error **errp) +-{ +- BlockJob *job; +- job = do_blockdev_backup(arg, NULL, errp); +- if (job) { +- job_start(&job->job); +- } ++ TransactionAction action = { ++ .type = TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP, ++ .u.blockdev_backup.data = backup, ++ }; ++ blockdev_do_action(&action, errp); + } + + /* Parameter check and block job starting for drive mirroring. 
+-- +1.8.3.1 + diff --git a/kvm-blockdev-unify-qmp_drive_backup-and-drive-backup-tra.patch b/kvm-blockdev-unify-qmp_drive_backup-and-drive-backup-tra.patch new file mode 100755 index 0000000..9ec1975 --- /dev/null +++ b/kvm-blockdev-unify-qmp_drive_backup-and-drive-backup-tra.patch @@ -0,0 +1,419 @@ +From 4a03ab2a6cc4974d8d43240d1297b09160818af3 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:42 +0000 +Subject: [PATCH 09/18] blockdev: unify qmp_drive_backup and drive-backup + transaction paths + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-3-slp@redhat.com> +Patchwork-id: 93755 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 2/9] blockdev: unify qmp_drive_backup and drive-backup transaction paths +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Issuing a drive-backup from qmp_drive_backup takes a slightly +different path than when it's issued from a transaction. In the code, +this is manifested as some redundancy between do_drive_backup() and +drive_backup_prepare(). + +This change unifies both paths, merging do_drive_backup() and +drive_backup_prepare(), and changing qmp_drive_backup() to create a +transaction instead of calling do_backup_common() directly. + +As a side-effect, now qmp_drive_backup() is executed inside a drained +section, as it happens when creating a drive-backup transaction. This +change is visible from the user's perspective, as the job gets paused +and immediately resumed before starting the actual work. + +Also fix tests 141, 185 and 219 to cope with the extra +JOB_STATUS_CHANGE lines. + +Signed-off-by: Sergio Lopez +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 2288ccfac96281c316db942d10e3f921c1373064) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. 
de Paula +--- + blockdev.c | 224 ++++++++++++++++++++------------------------- + tests/qemu-iotests/141.out | 2 + + tests/qemu-iotests/185.out | 2 + + tests/qemu-iotests/219 | 7 +- + tests/qemu-iotests/219.out | 8 ++ + 5 files changed, 117 insertions(+), 126 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 553e315..5e85fc0 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1761,39 +1761,128 @@ typedef struct DriveBackupState { + BlockJob *job; + } DriveBackupState; + +-static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn, +- Error **errp); ++static BlockJob *do_backup_common(BackupCommon *backup, ++ BlockDriverState *bs, ++ BlockDriverState *target_bs, ++ AioContext *aio_context, ++ JobTxn *txn, Error **errp); + + static void drive_backup_prepare(BlkActionState *common, Error **errp) + { + DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common); +- BlockDriverState *bs; + DriveBackup *backup; ++ BlockDriverState *bs; ++ BlockDriverState *target_bs; ++ BlockDriverState *source = NULL; + AioContext *aio_context; ++ QDict *options; + Error *local_err = NULL; ++ int flags; ++ int64_t size; ++ bool set_backing_hd = false; + + assert(common->action->type == TRANSACTION_ACTION_KIND_DRIVE_BACKUP); + backup = common->action->u.drive_backup.data; + ++ if (!backup->has_mode) { ++ backup->mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS; ++ } ++ + bs = bdrv_lookup_bs(backup->device, backup->device, errp); + if (!bs) { + return; + } + ++ if (!bs->drv) { ++ error_setg(errp, "Device has no medium"); ++ return; ++ } ++ + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + /* Paired with .clean() */ + bdrv_drained_begin(bs); + +- state->bs = bs; ++ if (!backup->has_format) { ++ backup->format = backup->mode == NEW_IMAGE_MODE_EXISTING ? 
++ NULL : (char *) bs->drv->format_name; ++ } ++ ++ /* Early check to avoid creating target */ ++ if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) { ++ goto out; ++ } ++ ++ flags = bs->open_flags | BDRV_O_RDWR; ++ ++ /* ++ * See if we have a backing HD we can use to create our new image ++ * on top of. ++ */ ++ if (backup->sync == MIRROR_SYNC_MODE_TOP) { ++ source = backing_bs(bs); ++ if (!source) { ++ backup->sync = MIRROR_SYNC_MODE_FULL; ++ } ++ } ++ if (backup->sync == MIRROR_SYNC_MODE_NONE) { ++ source = bs; ++ flags |= BDRV_O_NO_BACKING; ++ set_backing_hd = true; ++ } ++ ++ size = bdrv_getlength(bs); ++ if (size < 0) { ++ error_setg_errno(errp, -size, "bdrv_getlength failed"); ++ goto out; ++ } ++ ++ if (backup->mode != NEW_IMAGE_MODE_EXISTING) { ++ assert(backup->format); ++ if (source) { ++ bdrv_refresh_filename(source); ++ bdrv_img_create(backup->target, backup->format, source->filename, ++ source->drv->format_name, NULL, ++ size, flags, false, &local_err); ++ } else { ++ bdrv_img_create(backup->target, backup->format, NULL, NULL, NULL, ++ size, flags, false, &local_err); ++ } ++ } + +- state->job = do_drive_backup(backup, common->block_job_txn, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto out; + } + ++ options = qdict_new(); ++ qdict_put_str(options, "discard", "unmap"); ++ qdict_put_str(options, "detect-zeroes", "unmap"); ++ if (backup->format) { ++ qdict_put_str(options, "driver", backup->format); ++ } ++ ++ target_bs = bdrv_open(backup->target, NULL, options, flags, errp); ++ if (!target_bs) { ++ goto out; ++ } ++ ++ if (set_backing_hd) { ++ bdrv_set_backing_hd(target_bs, source, &local_err); ++ if (local_err) { ++ goto unref; ++ } ++ } ++ ++ state->bs = bs; ++ ++ state->job = do_backup_common(qapi_DriveBackup_base(backup), ++ bs, target_bs, aio_context, ++ common->block_job_txn, errp); ++ ++unref: ++ bdrv_unref(target_bs); + out: + aio_context_release(aio_context); + } +@@ -3587,126 +3676,13 @@ static 
BlockJob *do_backup_common(BackupCommon *backup, + return job; + } + +-static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn, +- Error **errp) +-{ +- BlockDriverState *bs; +- BlockDriverState *target_bs; +- BlockDriverState *source = NULL; +- BlockJob *job = NULL; +- AioContext *aio_context; +- QDict *options; +- Error *local_err = NULL; +- int flags; +- int64_t size; +- bool set_backing_hd = false; +- +- if (!backup->has_mode) { +- backup->mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS; +- } +- +- bs = bdrv_lookup_bs(backup->device, backup->device, errp); +- if (!bs) { +- return NULL; +- } +- +- if (!bs->drv) { +- error_setg(errp, "Device has no medium"); +- return NULL; +- } +- +- aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); +- +- if (!backup->has_format) { +- backup->format = backup->mode == NEW_IMAGE_MODE_EXISTING ? +- NULL : (char *) bs->drv->format_name; +- } +- +- /* Early check to avoid creating target */ +- if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) { +- goto out; +- } +- +- flags = bs->open_flags | BDRV_O_RDWR; +- +- /* +- * See if we have a backing HD we can use to create our new image +- * on top of. 
+- */ +- if (backup->sync == MIRROR_SYNC_MODE_TOP) { +- source = backing_bs(bs); +- if (!source) { +- backup->sync = MIRROR_SYNC_MODE_FULL; +- } +- } +- if (backup->sync == MIRROR_SYNC_MODE_NONE) { +- source = bs; +- flags |= BDRV_O_NO_BACKING; +- set_backing_hd = true; +- } +- +- size = bdrv_getlength(bs); +- if (size < 0) { +- error_setg_errno(errp, -size, "bdrv_getlength failed"); +- goto out; +- } +- +- if (backup->mode != NEW_IMAGE_MODE_EXISTING) { +- assert(backup->format); +- if (source) { +- bdrv_refresh_filename(source); +- bdrv_img_create(backup->target, backup->format, source->filename, +- source->drv->format_name, NULL, +- size, flags, false, &local_err); +- } else { +- bdrv_img_create(backup->target, backup->format, NULL, NULL, NULL, +- size, flags, false, &local_err); +- } +- } +- +- if (local_err) { +- error_propagate(errp, local_err); +- goto out; +- } +- +- options = qdict_new(); +- qdict_put_str(options, "discard", "unmap"); +- qdict_put_str(options, "detect-zeroes", "unmap"); +- if (backup->format) { +- qdict_put_str(options, "driver", backup->format); +- } +- +- target_bs = bdrv_open(backup->target, NULL, options, flags, errp); +- if (!target_bs) { +- goto out; +- } +- +- if (set_backing_hd) { +- bdrv_set_backing_hd(target_bs, source, &local_err); +- if (local_err) { +- goto unref; +- } +- } +- +- job = do_backup_common(qapi_DriveBackup_base(backup), +- bs, target_bs, aio_context, txn, errp); +- +-unref: +- bdrv_unref(target_bs); +-out: +- aio_context_release(aio_context); +- return job; +-} +- +-void qmp_drive_backup(DriveBackup *arg, Error **errp) ++void qmp_drive_backup(DriveBackup *backup, Error **errp) + { +- +- BlockJob *job; +- job = do_drive_backup(arg, NULL, errp); +- if (job) { +- job_start(&job->job); +- } ++ TransactionAction action = { ++ .type = TRANSACTION_ACTION_KIND_DRIVE_BACKUP, ++ .u.drive_backup.data = backup, ++ }; ++ blockdev_do_action(&action, errp); + } + + BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp) 
+diff --git a/tests/qemu-iotests/141.out b/tests/qemu-iotests/141.out +index 3645675..263b680 100644 +--- a/tests/qemu-iotests/141.out ++++ b/tests/qemu-iotests/141.out +@@ -13,6 +13,8 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/m. + Formatting 'TEST_DIR/o.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "paused", "id": "job0"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}} + {'execute': 'blockdev-del', 'arguments': {'node-name': 'drv0'}} + {"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: node is used as backing hd of 'NODE_NAME'"}} + {'execute': 'block-job-cancel', 'arguments': {'device': 'job0'}} +diff --git a/tests/qemu-iotests/185.out b/tests/qemu-iotests/185.out +index 8379ac5..9a3b657 100644 +--- a/tests/qemu-iotests/185.out ++++ b/tests/qemu-iotests/185.out +@@ -65,6 +65,8 @@ Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 cluster_size=65536 l + Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 cluster_size=65536 lazy_refcounts=off refcount_bits=16 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "disk"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "disk"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "paused", "id": 
"disk"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "disk"}} + {"return": {}} + { 'execute': 'quit' } + {"return": {}} +diff --git a/tests/qemu-iotests/219 b/tests/qemu-iotests/219 +index e0c5166..655f54d 100755 +--- a/tests/qemu-iotests/219 ++++ b/tests/qemu-iotests/219 +@@ -63,7 +63,7 @@ def test_pause_resume(vm): + # logged immediately + iotests.log(vm.qmp('query-jobs')) + +-def test_job_lifecycle(vm, job, job_args, has_ready=False): ++def test_job_lifecycle(vm, job, job_args, has_ready=False, is_mirror=False): + global img_size + + iotests.log('') +@@ -135,6 +135,9 @@ def test_job_lifecycle(vm, job, job_args, has_ready=False): + iotests.log('Waiting for PENDING state...') + iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE'))) + iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE'))) ++ if is_mirror: ++ iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE'))) ++ iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE'))) + + if not job_args.get('auto-finalize', True): + # PENDING state: +@@ -218,7 +221,7 @@ with iotests.FilePath('disk.img') as disk_path, \ + + for auto_finalize in [True, False]: + for auto_dismiss in [True, False]: +- test_job_lifecycle(vm, 'drive-backup', job_args={ ++ test_job_lifecycle(vm, 'drive-backup', is_mirror=True, job_args={ + 'device': 'drive0-node', + 'target': copy_path, + 'sync': 'full', +diff --git a/tests/qemu-iotests/219.out b/tests/qemu-iotests/219.out +index 8ebd3fe..0ea5d0b 100644 +--- a/tests/qemu-iotests/219.out ++++ b/tests/qemu-iotests/219.out +@@ -135,6 +135,8 @@ Pause/resume in RUNNING + {"return": {}} + + Waiting for PENDING state... 
++{"data": {"id": "job0", "status": "paused"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"id": "job0", "status": "running"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "waiting"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "pending"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "concluded"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} +@@ -186,6 +188,8 @@ Pause/resume in RUNNING + {"return": {}} + + Waiting for PENDING state... ++{"data": {"id": "job0", "status": "paused"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"id": "job0", "status": "running"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "waiting"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "pending"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "concluded"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} +@@ -245,6 +249,8 @@ Pause/resume in RUNNING + {"return": {}} + + Waiting for PENDING state... 
++{"data": {"id": "job0", "status": "paused"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"id": "job0", "status": "running"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "waiting"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "pending"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"return": [{"current-progress": 4194304, "id": "job0", "status": "pending", "total-progress": 4194304, "type": "backup"}]} +@@ -304,6 +310,8 @@ Pause/resume in RUNNING + {"return": {}} + + Waiting for PENDING state... ++{"data": {"id": "job0", "status": "paused"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"id": "job0", "status": "running"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "waiting"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "pending"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"return": [{"current-progress": 4194304, "id": "job0", "status": "pending", "total-progress": 4194304, "type": "backup"}]} +-- +1.8.3.1 + diff --git a/kvm-bootp-check-bootp_input-buffer-size.patch b/kvm-bootp-check-bootp_input-buffer-size.patch new file mode 100755 index 0000000..3362cb0 --- /dev/null +++ b/kvm-bootp-check-bootp_input-buffer-size.patch @@ -0,0 +1,52 @@ +From a66ab346bf74ebf3ed8fca0dc2e2febfe70069e8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:28 -0400 +Subject: [PATCH 07/14] bootp: check bootp_input buffer size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit 
+ +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-4-marcandre.lureau@redhat.com> +Patchwork-id: 101820 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 3/8] bootp: check bootp_input buffer size +Bugzilla: 1970819 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Fixes: CVE-2021-3592 +Fixes: https://gitlab.freedesktop.org/slirp/libslirp/-/issues/44 + +Signed-off-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1970819 + +(cherry picked from commit 2eca0838eee1da96204545e22cdaed860d9d7c6c) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/bootp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/slirp/src/bootp.c b/slirp/src/bootp.c +index 5754327138..5789187166 100644 +--- a/slirp/src/bootp.c ++++ b/slirp/src/bootp.c +@@ -366,9 +366,9 @@ udp_output(NULL, m, &saddr, &daddr, IPTOS_LOWDELAY); + + void bootp_input(struct mbuf *m) + { +- struct bootp_t *bp = mtod(m, struct bootp_t *); ++ struct bootp_t *bp = mtod_check(m, sizeof(struct bootp_t)); + +- if (bp->bp_op == BOOTP_REQUEST) { ++ if (bp && bp->bp_op == BOOTP_REQUEST) { + bootp_reply(m->slirp, bp, m_end(m)); + } + } +-- +2.27.0 + diff --git a/kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch b/kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch new file mode 100755 index 0000000..bbf9b03 --- /dev/null +++ b/kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch @@ -0,0 +1,175 @@ +From 8198ae7c21a4d37f7e365058f973867c41d44d21 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:25 -0400 +Subject: [PATCH 06/14] bootp: limit vendor-specific area to input packet + memory buffer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-3-marcandre.lureau@redhat.com> +Patchwork-id: 
101821 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/8] bootp: limit vendor-specific area to input packet memory buffer +Bugzilla: 1970819 1970835 1970843 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +sizeof(bootp_t) currently holds DHCP_OPT_LEN. Remove this optional field +from the structure, to help with the following patch checking for +minimal header size. Modify the bootp_reply() function to take the +buffer boundaries and avoid a potential buffer overflow. + +Related to CVE-2021-3592. + +https://gitlab.freedesktop.org/slirp/libslirp/-/issues/44 + +Signed-off-by: Marc-André Lureau + +(cherry picked from commit f13cad45b25d92760bb0ad67bec0300a4d7d5275) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/bootp.c | 26 +++++++++++++++----------- + slirp/src/bootp.h | 2 +- + slirp/src/mbuf.c | 5 +++++ + slirp/src/mbuf.h | 1 + + 4 files changed, 22 insertions(+), 12 deletions(-) + +diff --git a/slirp/src/bootp.c b/slirp/src/bootp.c +index 3f9ce2553e..5754327138 100644 +--- a/slirp/src/bootp.c ++++ b/slirp/src/bootp.c +@@ -92,21 +92,22 @@ found: + return bc; + } + +-static void dhcp_decode(const struct bootp_t *bp, int *pmsg_type, ++static void dhcp_decode(const struct bootp_t *bp, ++ const uint8_t *bp_end, ++ int *pmsg_type, + struct in_addr *preq_addr) + { +- const uint8_t *p, *p_end; ++ const uint8_t *p; + int len, tag; + + *pmsg_type = 0; + preq_addr->s_addr = htonl(0L); + + p = bp->bp_vend; +- p_end = p + DHCP_OPT_LEN; + if (memcmp(p, rfc1533_cookie, 4) != 0) + return; + p += 4; +- while (p < p_end) { ++ while (p < bp_end) { + tag = p[0]; + if (tag == RFC1533_PAD) { + p++; +@@ -114,10 +115,10 @@ static void dhcp_decode(const struct bootp_t *bp, int *pmsg_type, + break; + } else { + p++; +- if (p >= p_end) ++ if (p >= bp_end) + break; + len = *p++; +- if (p + len > p_end) { ++ if (p + len > bp_end) { + break; + } + DPRINTF("dhcp: tag=%d len=%d\n", tag, 
len); +@@ -144,7 +145,9 @@ static void dhcp_decode(const struct bootp_t *bp, int *pmsg_type, + } + } + +-static void bootp_reply(Slirp *slirp, const struct bootp_t *bp) ++static void bootp_reply(Slirp *slirp, ++ const struct bootp_t *bp, ++ const uint8_t *bp_end) + { + BOOTPClient *bc = NULL; + struct mbuf *m; +@@ -157,7 +160,7 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp) + uint8_t client_ethaddr[ETH_ALEN]; + + /* extract exact DHCP msg type */ +- dhcp_decode(bp, &dhcp_msg_type, &preq_addr); ++ dhcp_decode(bp, bp_end, &dhcp_msg_type, &preq_addr); + DPRINTF("bootp packet op=%d msgtype=%d", bp->bp_op, dhcp_msg_type); + if (preq_addr.s_addr != htonl(0L)) + DPRINTF(" req_addr=%08" PRIx32 "\n", ntohl(preq_addr.s_addr)); +@@ -179,9 +182,10 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp) + return; + } + m->m_data += IF_MAXLINKHDR; ++ m_inc(m, sizeof(struct bootp_t) + DHCP_OPT_LEN); + rbp = (struct bootp_t *)m->m_data; + m->m_data += sizeof(struct udpiphdr); +- memset(rbp, 0, sizeof(struct bootp_t)); ++ memset(rbp, 0, sizeof(struct bootp_t) + DHCP_OPT_LEN); + + if (dhcp_msg_type == DHCPDISCOVER) { + if (preq_addr.s_addr != htonl(0L)) { +@@ -235,7 +239,7 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp) + rbp->bp_siaddr = saddr.sin_addr; /* Server IP address */ + + q = rbp->bp_vend; +- end = (uint8_t *)&rbp[1]; ++ end = rbp->bp_vend + DHCP_OPT_LEN; + memcpy(q, rfc1533_cookie, 4); + q += 4; + +@@ -365,6 +369,6 @@ void bootp_input(struct mbuf *m) + struct bootp_t *bp = mtod(m, struct bootp_t *); + + if (bp->bp_op == BOOTP_REQUEST) { +- bootp_reply(m->slirp, bp); ++ bootp_reply(m->slirp, bp, m_end(m)); + } + } +diff --git a/slirp/src/bootp.h b/slirp/src/bootp.h +index 03ece9bf28..0d20a944a8 100644 +--- a/slirp/src/bootp.h ++++ b/slirp/src/bootp.h +@@ -114,7 +114,7 @@ struct bootp_t { + uint8_t bp_hwaddr[16]; + uint8_t bp_sname[64]; + uint8_t bp_file[128]; +- uint8_t bp_vend[DHCP_OPT_LEN]; ++ uint8_t bp_vend[]; + }; + + 
typedef struct { +diff --git a/slirp/src/mbuf.c b/slirp/src/mbuf.c +index 6d0653ed3d..7db07c088e 100644 +--- a/slirp/src/mbuf.c ++++ b/slirp/src/mbuf.c +@@ -233,3 +233,8 @@ void *mtod_check(struct mbuf *m, size_t len) + + return NULL; + } ++ ++void *m_end(struct mbuf *m) ++{ ++ return m->m_data + m->m_len; ++} +diff --git a/slirp/src/mbuf.h b/slirp/src/mbuf.h +index 2015e3232f..a9752a36e0 100644 +--- a/slirp/src/mbuf.h ++++ b/slirp/src/mbuf.h +@@ -119,6 +119,7 @@ void m_adj(struct mbuf *, int); + int m_copy(struct mbuf *, struct mbuf *, int, int); + struct mbuf *dtom(Slirp *, void *); + void *mtod_check(struct mbuf *, size_t len); ++void *m_end(struct mbuf *); + + static inline void ifs_init(struct mbuf *ifm) + { +-- +2.27.0 + diff --git a/kvm-build-rename-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch b/kvm-build-rename-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch new file mode 100755 index 0000000..5d21bf8 --- /dev/null +++ b/kvm-build-rename-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch @@ -0,0 +1,137 @@ +From f756c1c4590a37c533ec0429644a7034ba35dada Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:38 +0100 +Subject: [PATCH 007/116] build: rename CONFIG_LIBCAP to CONFIG_LIBCAP_NG +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-4-dgilbert@redhat.com> +Patchwork-id: 93459 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 003/112] build: rename CONFIG_LIBCAP to CONFIG_LIBCAP_NG +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Paolo Bonzini + +Since we are actually testing for the newer capng library, rename the +symbol to match. + +Reviewed-by: Dr. 
David Alan Gilbert +Signed-off-by: Paolo Bonzini +(cherry picked from commit a358bca24026a377e0804e137a4499e4e041918d) +Signed-off-by: Miroslav Rezanina +--- + configure | 2 +- + qemu-bridge-helper.c | 6 +++--- + scsi/qemu-pr-helper.c | 12 ++++++------ + 3 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/configure b/configure +index 16564f8..7831618 100755 +--- a/configure ++++ b/configure +@@ -6760,7 +6760,7 @@ if test "$l2tpv3" = "yes" ; then + echo "CONFIG_L2TPV3=y" >> $config_host_mak + fi + if test "$cap_ng" = "yes" ; then +- echo "CONFIG_LIBCAP=y" >> $config_host_mak ++ echo "CONFIG_LIBCAP_NG=y" >> $config_host_mak + fi + echo "CONFIG_AUDIO_DRIVERS=$audio_drv_list" >> $config_host_mak + for drv in $audio_drv_list; do +diff --git a/qemu-bridge-helper.c b/qemu-bridge-helper.c +index 3d50ec0..88b2674 100644 +--- a/qemu-bridge-helper.c ++++ b/qemu-bridge-helper.c +@@ -43,7 +43,7 @@ + + #include "net/tap-linux.h" + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + #include + #endif + +@@ -207,7 +207,7 @@ static int send_fd(int c, int fd) + return sendmsg(c, &msg, 0); + } + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + static int drop_privileges(void) + { + /* clear all capabilities */ +@@ -246,7 +246,7 @@ int main(int argc, char **argv) + int access_allowed, access_denied; + int ret = EXIT_SUCCESS; + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + /* if we're run from an suid binary, immediately drop privileges preserving + * cap_net_admin */ + if (geteuid() == 0 && getuid() != geteuid()) { +diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c +index debb18f..0659cee 100644 +--- a/scsi/qemu-pr-helper.c ++++ b/scsi/qemu-pr-helper.c +@@ -24,7 +24,7 @@ + #include + #include + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + #include + #endif + #include +@@ -70,7 +70,7 @@ static int num_active_sockets = 1; + static int noisy; + static int verbose; + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + static int uid = -1; + static int 
gid = -1; + #endif +@@ -97,7 +97,7 @@ static void usage(const char *name) + " (default '%s')\n" + " -T, --trace [[enable=]][,events=][,file=]\n" + " specify tracing options\n" +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + " -u, --user=USER user to drop privileges to\n" + " -g, --group=GROUP group to drop privileges to\n" + #endif +@@ -827,7 +827,7 @@ static void close_server_socket(void) + num_active_sockets--; + } + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + static int drop_privileges(void) + { + /* clear all capabilities */ +@@ -920,7 +920,7 @@ int main(int argc, char **argv) + pidfile = g_strdup(optarg); + pidfile_specified = true; + break; +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + case 'u': { + unsigned long res; + struct passwd *userinfo = getpwnam(optarg); +@@ -1056,7 +1056,7 @@ int main(int argc, char **argv) + exit(EXIT_FAILURE); + } + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + if (drop_privileges() < 0) { + error_report("Failed to drop privileges: %s", strerror(errno)); + exit(EXIT_FAILURE); +-- +1.8.3.1 + diff --git a/kvm-build-sys-do-not-make-qemu-ga-link-with-pixman.patch b/kvm-build-sys-do-not-make-qemu-ga-link-with-pixman.patch new file mode 100755 index 0000000..5b1b170 --- /dev/null +++ b/kvm-build-sys-do-not-make-qemu-ga-link-with-pixman.patch @@ -0,0 +1,2463 @@ +From fc2d0dfe60b14992a9b67e7a18394ba6365dc5ed Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 18 Mar 2020 18:10:40 +0000 +Subject: [PATCH 2/2] build-sys: do not make qemu-ga link with pixman +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200318181040.256425-1-marcandre.lureau@redhat.com> +Patchwork-id: 94381 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH] build-sys: do not make qemu-ga link with pixman +Bugzilla: 1811670 +RH-Acked-by: Markus Armbruster +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. 
Berrange + +Since commit d52c454aadcdae74506f315ebf8b58bb79a05573 ("contrib: add +vhost-user-gpu"), qemu-ga is linking with pixman. + +This is because the Make-based build-system uses a global namespace for +variables, and we rely on "main.o-libs" for different linking targets. + +Note: this kind of variable clashing is hard to fix or prevent +currently. meson should help, as declarations have a linear +dependency and don't rely so much on variables and clever tricks. + +Note2: we have a lot of main.c (or other duplicated names!) in +tree. Imho, it would be annoying and a bad workaround to rename all +those to avoid conflicts like I did here. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1811670 + +Signed-off-by: Marc-André Lureau +Message-Id: <20200311160923.882474-1-marcandre.lureau@redhat.com> +Signed-off-by: Paolo Bonzini + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1811670 +Brew: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=27330493 + +(cherry picked from commit 5b42bc5ce9ab4a3171819feea5042931817211fd) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L.
de Paula +--- + contrib/vhost-user-gpu/Makefile.objs | 6 +- + contrib/vhost-user-gpu/main.c | 1191 ------------------------------- + contrib/vhost-user-gpu/vhost-user-gpu.c | 1191 +++++++++++++++++++++++++++++++ + 3 files changed, 1194 insertions(+), 1194 deletions(-) + delete mode 100644 contrib/vhost-user-gpu/main.c + create mode 100644 contrib/vhost-user-gpu/vhost-user-gpu.c + +diff --git a/contrib/vhost-user-gpu/Makefile.objs b/contrib/vhost-user-gpu/Makefile.objs +index 6170c91..0929609 100644 +--- a/contrib/vhost-user-gpu/Makefile.objs ++++ b/contrib/vhost-user-gpu/Makefile.objs +@@ -1,7 +1,7 @@ +-vhost-user-gpu-obj-y = main.o virgl.o vugbm.o ++vhost-user-gpu-obj-y = vhost-user-gpu.o virgl.o vugbm.o + +-main.o-cflags := $(PIXMAN_CFLAGS) $(GBM_CFLAGS) +-main.o-libs := $(PIXMAN_LIBS) ++vhost-user-gpu.o-cflags := $(PIXMAN_CFLAGS) $(GBM_CFLAGS) ++vhost-user-gpu.o-libs := $(PIXMAN_LIBS) + + virgl.o-cflags := $(VIRGL_CFLAGS) $(GBM_CFLAGS) + virgl.o-libs := $(VIRGL_LIBS) +diff --git a/contrib/vhost-user-gpu/main.c b/contrib/vhost-user-gpu/main.c +deleted file mode 100644 +index b45d201..0000000 +--- a/contrib/vhost-user-gpu/main.c ++++ /dev/null +@@ -1,1191 +0,0 @@ +-/* +- * Virtio vhost-user GPU Device +- * +- * Copyright Red Hat, Inc. 2013-2018 +- * +- * Authors: +- * Dave Airlie +- * Gerd Hoffmann +- * Marc-André Lureau +- * +- * This work is licensed under the terms of the GNU GPL, version 2 or later. +- * See the COPYING file in the top-level directory. 
+- */ +-#include "qemu/osdep.h" +-#include "qemu/drm.h" +-#include "qapi/error.h" +-#include "qemu/sockets.h" +- +-#include +-#include +- +-#include "vugpu.h" +-#include "hw/virtio/virtio-gpu-bswap.h" +-#include "hw/virtio/virtio-gpu-pixman.h" +-#include "virgl.h" +-#include "vugbm.h" +- +-enum { +- VHOST_USER_GPU_MAX_QUEUES = 2, +-}; +- +-struct virtio_gpu_simple_resource { +- uint32_t resource_id; +- uint32_t width; +- uint32_t height; +- uint32_t format; +- struct iovec *iov; +- unsigned int iov_cnt; +- uint32_t scanout_bitmask; +- pixman_image_t *image; +- struct vugbm_buffer buffer; +- QTAILQ_ENTRY(virtio_gpu_simple_resource) next; +-}; +- +-static gboolean opt_print_caps; +-static int opt_fdnum = -1; +-static char *opt_socket_path; +-static char *opt_render_node; +-static gboolean opt_virgl; +- +-static void vg_handle_ctrl(VuDev *dev, int qidx); +- +-static const char * +-vg_cmd_to_string(int cmd) +-{ +-#define CMD(cmd) [cmd] = #cmd +- static const char *vg_cmd_str[] = { +- CMD(VIRTIO_GPU_UNDEFINED), +- +- /* 2d commands */ +- CMD(VIRTIO_GPU_CMD_GET_DISPLAY_INFO), +- CMD(VIRTIO_GPU_CMD_RESOURCE_CREATE_2D), +- CMD(VIRTIO_GPU_CMD_RESOURCE_UNREF), +- CMD(VIRTIO_GPU_CMD_SET_SCANOUT), +- CMD(VIRTIO_GPU_CMD_RESOURCE_FLUSH), +- CMD(VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D), +- CMD(VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING), +- CMD(VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING), +- CMD(VIRTIO_GPU_CMD_GET_CAPSET_INFO), +- CMD(VIRTIO_GPU_CMD_GET_CAPSET), +- +- /* 3d commands */ +- CMD(VIRTIO_GPU_CMD_CTX_CREATE), +- CMD(VIRTIO_GPU_CMD_CTX_DESTROY), +- CMD(VIRTIO_GPU_CMD_CTX_ATTACH_RESOURCE), +- CMD(VIRTIO_GPU_CMD_CTX_DETACH_RESOURCE), +- CMD(VIRTIO_GPU_CMD_RESOURCE_CREATE_3D), +- CMD(VIRTIO_GPU_CMD_TRANSFER_TO_HOST_3D), +- CMD(VIRTIO_GPU_CMD_TRANSFER_FROM_HOST_3D), +- CMD(VIRTIO_GPU_CMD_SUBMIT_3D), +- +- /* cursor commands */ +- CMD(VIRTIO_GPU_CMD_UPDATE_CURSOR), +- CMD(VIRTIO_GPU_CMD_MOVE_CURSOR), +- }; +-#undef REQ +- +- if (cmd >= 0 && cmd < G_N_ELEMENTS(vg_cmd_str)) { +- return 
vg_cmd_str[cmd]; +- } else { +- return "unknown"; +- } +-} +- +-static int +-vg_sock_fd_read(int sock, void *buf, ssize_t buflen) +-{ +- int ret; +- +- do { +- ret = read(sock, buf, buflen); +- } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); +- +- g_warn_if_fail(ret == buflen); +- return ret; +-} +- +-static void +-vg_sock_fd_close(VuGpu *g) +-{ +- if (g->sock_fd >= 0) { +- close(g->sock_fd); +- g->sock_fd = -1; +- } +-} +- +-static gboolean +-source_wait_cb(gint fd, GIOCondition condition, gpointer user_data) +-{ +- VuGpu *g = user_data; +- +- if (!vg_recv_msg(g, VHOST_USER_GPU_DMABUF_UPDATE, 0, NULL)) { +- return G_SOURCE_CONTINUE; +- } +- +- /* resume */ +- g->wait_ok = 0; +- vg_handle_ctrl(&g->dev.parent, 0); +- +- return G_SOURCE_REMOVE; +-} +- +-void +-vg_wait_ok(VuGpu *g) +-{ +- assert(g->wait_ok == 0); +- g->wait_ok = g_unix_fd_add(g->sock_fd, G_IO_IN | G_IO_HUP, +- source_wait_cb, g); +-} +- +-static int +-vg_sock_fd_write(int sock, const void *buf, ssize_t buflen, int fd) +-{ +- ssize_t ret; +- struct iovec iov = { +- .iov_base = (void *)buf, +- .iov_len = buflen, +- }; +- struct msghdr msg = { +- .msg_iov = &iov, +- .msg_iovlen = 1, +- }; +- union { +- struct cmsghdr cmsghdr; +- char control[CMSG_SPACE(sizeof(int))]; +- } cmsgu; +- struct cmsghdr *cmsg; +- +- if (fd != -1) { +- msg.msg_control = cmsgu.control; +- msg.msg_controllen = sizeof(cmsgu.control); +- +- cmsg = CMSG_FIRSTHDR(&msg); +- cmsg->cmsg_len = CMSG_LEN(sizeof(int)); +- cmsg->cmsg_level = SOL_SOCKET; +- cmsg->cmsg_type = SCM_RIGHTS; +- +- *((int *)CMSG_DATA(cmsg)) = fd; +- } +- +- do { +- ret = sendmsg(sock, &msg, 0); +- } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); +- +- g_warn_if_fail(ret == buflen); +- return ret; +-} +- +-void +-vg_send_msg(VuGpu *vg, const VhostUserGpuMsg *msg, int fd) +-{ +- if (vg_sock_fd_write(vg->sock_fd, msg, +- VHOST_USER_GPU_HDR_SIZE + msg->size, fd) < 0) { +- vg_sock_fd_close(vg); +- } +-} +- +-bool +-vg_recv_msg(VuGpu *g, uint32_t 
expect_req, uint32_t expect_size, +- gpointer payload) +-{ +- uint32_t req, flags, size; +- +- if (vg_sock_fd_read(g->sock_fd, &req, sizeof(req)) < 0 || +- vg_sock_fd_read(g->sock_fd, &flags, sizeof(flags)) < 0 || +- vg_sock_fd_read(g->sock_fd, &size, sizeof(size)) < 0) { +- goto err; +- } +- +- g_return_val_if_fail(req == expect_req, false); +- g_return_val_if_fail(flags & VHOST_USER_GPU_MSG_FLAG_REPLY, false); +- g_return_val_if_fail(size == expect_size, false); +- +- if (size && vg_sock_fd_read(g->sock_fd, payload, size) != size) { +- goto err; +- } +- +- return true; +- +-err: +- vg_sock_fd_close(g); +- return false; +-} +- +-static struct virtio_gpu_simple_resource * +-virtio_gpu_find_resource(VuGpu *g, uint32_t resource_id) +-{ +- struct virtio_gpu_simple_resource *res; +- +- QTAILQ_FOREACH(res, &g->reslist, next) { +- if (res->resource_id == resource_id) { +- return res; +- } +- } +- return NULL; +-} +- +-void +-vg_ctrl_response(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd, +- struct virtio_gpu_ctrl_hdr *resp, +- size_t resp_len) +-{ +- size_t s; +- +- if (cmd->cmd_hdr.flags & VIRTIO_GPU_FLAG_FENCE) { +- resp->flags |= VIRTIO_GPU_FLAG_FENCE; +- resp->fence_id = cmd->cmd_hdr.fence_id; +- resp->ctx_id = cmd->cmd_hdr.ctx_id; +- } +- virtio_gpu_ctrl_hdr_bswap(resp); +- s = iov_from_buf(cmd->elem.in_sg, cmd->elem.in_num, 0, resp, resp_len); +- if (s != resp_len) { +- g_critical("%s: response size incorrect %zu vs %zu", +- __func__, s, resp_len); +- } +- vu_queue_push(&g->dev.parent, cmd->vq, &cmd->elem, s); +- vu_queue_notify(&g->dev.parent, cmd->vq); +- cmd->finished = true; +-} +- +-void +-vg_ctrl_response_nodata(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd, +- enum virtio_gpu_ctrl_type type) +-{ +- struct virtio_gpu_ctrl_hdr resp = { +- .type = type, +- }; +- +- vg_ctrl_response(g, cmd, &resp, sizeof(resp)); +-} +- +-void +-vg_get_display_info(VuGpu *vg, struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_resp_display_info dpy_info = { {} }; 
+- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_GET_DISPLAY_INFO, +- .size = 0, +- }; +- +- assert(vg->wait_ok == 0); +- +- vg_send_msg(vg, &msg, -1); +- if (!vg_recv_msg(vg, msg.request, sizeof(dpy_info), &dpy_info)) { +- return; +- } +- +- vg_ctrl_response(vg, cmd, &dpy_info.hdr, sizeof(dpy_info)); +-} +- +-static void +-vg_resource_create_2d(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- pixman_format_code_t pformat; +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_create_2d c2d; +- +- VUGPU_FILL_CMD(c2d); +- virtio_gpu_bswap_32(&c2d, sizeof(c2d)); +- +- if (c2d.resource_id == 0) { +- g_critical("%s: resource id 0 is not allowed", __func__); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- res = virtio_gpu_find_resource(g, c2d.resource_id); +- if (res) { +- g_critical("%s: resource already exists %d", __func__, c2d.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- res = g_new0(struct virtio_gpu_simple_resource, 1); +- res->width = c2d.width; +- res->height = c2d.height; +- res->format = c2d.format; +- res->resource_id = c2d.resource_id; +- +- pformat = virtio_gpu_get_pixman_format(c2d.format); +- if (!pformat) { +- g_critical("%s: host couldn't handle guest format %d", +- __func__, c2d.format); +- g_free(res); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; +- return; +- } +- vugbm_buffer_create(&res->buffer, &g->gdev, c2d.width, c2d.height); +- res->image = pixman_image_create_bits(pformat, +- c2d.width, +- c2d.height, +- (uint32_t *)res->buffer.mmap, +- res->buffer.stride); +- if (!res->image) { +- g_critical("%s: resource creation failed %d %d %d", +- __func__, c2d.resource_id, c2d.width, c2d.height); +- g_free(res); +- cmd->error = VIRTIO_GPU_RESP_ERR_OUT_OF_MEMORY; +- return; +- } +- +- QTAILQ_INSERT_HEAD(&g->reslist, res, next); +-} +- +-static void +-vg_disable_scanout(VuGpu *g, int scanout_id) +-{ +- struct virtio_gpu_scanout *scanout 
= &g->scanout[scanout_id]; +- struct virtio_gpu_simple_resource *res; +- +- if (scanout->resource_id == 0) { +- return; +- } +- +- res = virtio_gpu_find_resource(g, scanout->resource_id); +- if (res) { +- res->scanout_bitmask &= ~(1 << scanout_id); +- } +- +- scanout->width = 0; +- scanout->height = 0; +- +- if (g->sock_fd >= 0) { +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_SCANOUT, +- .size = sizeof(VhostUserGpuScanout), +- .payload.scanout.scanout_id = scanout_id, +- }; +- vg_send_msg(g, &msg, -1); +- } +-} +- +-static void +-vg_resource_destroy(VuGpu *g, +- struct virtio_gpu_simple_resource *res) +-{ +- int i; +- +- if (res->scanout_bitmask) { +- for (i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++) { +- if (res->scanout_bitmask & (1 << i)) { +- vg_disable_scanout(g, i); +- } +- } +- } +- +- vugbm_buffer_destroy(&res->buffer); +- pixman_image_unref(res->image); +- QTAILQ_REMOVE(&g->reslist, res, next); +- g_free(res); +-} +- +-static void +-vg_resource_unref(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_unref unref; +- +- VUGPU_FILL_CMD(unref); +- virtio_gpu_bswap_32(&unref, sizeof(unref)); +- +- res = virtio_gpu_find_resource(g, unref.resource_id); +- if (!res) { +- g_critical("%s: illegal resource specified %d", +- __func__, unref.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- vg_resource_destroy(g, res); +-} +- +-int +-vg_create_mapping_iov(VuGpu *g, +- struct virtio_gpu_resource_attach_backing *ab, +- struct virtio_gpu_ctrl_command *cmd, +- struct iovec **iov) +-{ +- struct virtio_gpu_mem_entry *ents; +- size_t esize, s; +- int i; +- +- if (ab->nr_entries > 16384) { +- g_critical("%s: nr_entries is too big (%d > 16384)", +- __func__, ab->nr_entries); +- return -1; +- } +- +- esize = sizeof(*ents) * ab->nr_entries; +- ents = g_malloc(esize); +- s = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, +- sizeof(*ab), ents, esize); +- if (s 
!= esize) { +- g_critical("%s: command data size incorrect %zu vs %zu", +- __func__, s, esize); +- g_free(ents); +- return -1; +- } +- +- *iov = g_malloc0(sizeof(struct iovec) * ab->nr_entries); +- for (i = 0; i < ab->nr_entries; i++) { +- uint64_t len = ents[i].length; +- (*iov)[i].iov_len = ents[i].length; +- (*iov)[i].iov_base = vu_gpa_to_va(&g->dev.parent, &len, ents[i].addr); +- if (!(*iov)[i].iov_base || len != ents[i].length) { +- g_critical("%s: resource %d element %d", +- __func__, ab->resource_id, i); +- g_free(*iov); +- g_free(ents); +- *iov = NULL; +- return -1; +- } +- } +- g_free(ents); +- return 0; +-} +- +-static void +-vg_resource_attach_backing(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_attach_backing ab; +- int ret; +- +- VUGPU_FILL_CMD(ab); +- virtio_gpu_bswap_32(&ab, sizeof(ab)); +- +- res = virtio_gpu_find_resource(g, ab.resource_id); +- if (!res) { +- g_critical("%s: illegal resource specified %d", +- __func__, ab.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- ret = vg_create_mapping_iov(g, &ab, cmd, &res->iov); +- if (ret != 0) { +- cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; +- return; +- } +- +- res->iov_cnt = ab.nr_entries; +-} +- +-static void +-vg_resource_detach_backing(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_detach_backing detach; +- +- VUGPU_FILL_CMD(detach); +- virtio_gpu_bswap_32(&detach, sizeof(detach)); +- +- res = virtio_gpu_find_resource(g, detach.resource_id); +- if (!res || !res->iov) { +- g_critical("%s: illegal resource specified %d", +- __func__, detach.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- g_free(res->iov); +- res->iov = NULL; +- res->iov_cnt = 0; +-} +- +-static void +-vg_transfer_to_host_2d(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct 
virtio_gpu_simple_resource *res; +- int h; +- uint32_t src_offset, dst_offset, stride; +- int bpp; +- pixman_format_code_t format; +- struct virtio_gpu_transfer_to_host_2d t2d; +- +- VUGPU_FILL_CMD(t2d); +- virtio_gpu_t2d_bswap(&t2d); +- +- res = virtio_gpu_find_resource(g, t2d.resource_id); +- if (!res || !res->iov) { +- g_critical("%s: illegal resource specified %d", +- __func__, t2d.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- if (t2d.r.x > res->width || +- t2d.r.y > res->height || +- t2d.r.width > res->width || +- t2d.r.height > res->height || +- t2d.r.x + t2d.r.width > res->width || +- t2d.r.y + t2d.r.height > res->height) { +- g_critical("%s: transfer bounds outside resource" +- " bounds for resource %d: %d %d %d %d vs %d %d", +- __func__, t2d.resource_id, t2d.r.x, t2d.r.y, +- t2d.r.width, t2d.r.height, res->width, res->height); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; +- return; +- } +- +- format = pixman_image_get_format(res->image); +- bpp = (PIXMAN_FORMAT_BPP(format) + 7) / 8; +- stride = pixman_image_get_stride(res->image); +- +- if (t2d.offset || t2d.r.x || t2d.r.y || +- t2d.r.width != pixman_image_get_width(res->image)) { +- void *img_data = pixman_image_get_data(res->image); +- for (h = 0; h < t2d.r.height; h++) { +- src_offset = t2d.offset + stride * h; +- dst_offset = (t2d.r.y + h) * stride + (t2d.r.x * bpp); +- +- iov_to_buf(res->iov, res->iov_cnt, src_offset, +- img_data +- + dst_offset, t2d.r.width * bpp); +- } +- } else { +- iov_to_buf(res->iov, res->iov_cnt, 0, +- pixman_image_get_data(res->image), +- pixman_image_get_stride(res->image) +- * pixman_image_get_height(res->image)); +- } +-} +- +-static void +-vg_set_scanout(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res, *ores; +- struct virtio_gpu_scanout *scanout; +- struct virtio_gpu_set_scanout ss; +- int fd; +- +- VUGPU_FILL_CMD(ss); +- virtio_gpu_bswap_32(&ss, sizeof(ss)); +- +- if 
(ss.scanout_id >= VIRTIO_GPU_MAX_SCANOUTS) { +- g_critical("%s: illegal scanout id specified %d", +- __func__, ss.scanout_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_SCANOUT_ID; +- return; +- } +- +- if (ss.resource_id == 0) { +- vg_disable_scanout(g, ss.scanout_id); +- return; +- } +- +- /* create a surface for this scanout */ +- res = virtio_gpu_find_resource(g, ss.resource_id); +- if (!res) { +- g_critical("%s: illegal resource specified %d", +- __func__, ss.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- if (ss.r.x > res->width || +- ss.r.y > res->height || +- ss.r.width > res->width || +- ss.r.height > res->height || +- ss.r.x + ss.r.width > res->width || +- ss.r.y + ss.r.height > res->height) { +- g_critical("%s: illegal scanout %d bounds for" +- " resource %d, (%d,%d)+%d,%d vs %d %d", +- __func__, ss.scanout_id, ss.resource_id, ss.r.x, ss.r.y, +- ss.r.width, ss.r.height, res->width, res->height); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; +- return; +- } +- +- scanout = &g->scanout[ss.scanout_id]; +- +- ores = virtio_gpu_find_resource(g, scanout->resource_id); +- if (ores) { +- ores->scanout_bitmask &= ~(1 << ss.scanout_id); +- } +- +- res->scanout_bitmask |= (1 << ss.scanout_id); +- scanout->resource_id = ss.resource_id; +- scanout->x = ss.r.x; +- scanout->y = ss.r.y; +- scanout->width = ss.r.width; +- scanout->height = ss.r.height; +- +- struct vugbm_buffer *buffer = &res->buffer; +- +- if (vugbm_buffer_can_get_dmabuf_fd(buffer)) { +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_DMABUF_SCANOUT, +- .size = sizeof(VhostUserGpuDMABUFScanout), +- .payload.dmabuf_scanout = (VhostUserGpuDMABUFScanout) { +- .scanout_id = ss.scanout_id, +- .x = ss.r.x, +- .y = ss.r.y, +- .width = ss.r.width, +- .height = ss.r.height, +- .fd_width = buffer->width, +- .fd_height = buffer->height, +- .fd_stride = buffer->stride, +- .fd_drm_fourcc = buffer->format +- } +- }; +- +- if 
(vugbm_buffer_get_dmabuf_fd(buffer, &fd)) { +- vg_send_msg(g, &msg, fd); +- close(fd); +- } +- } else { +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_SCANOUT, +- .size = sizeof(VhostUserGpuScanout), +- .payload.scanout = (VhostUserGpuScanout) { +- .scanout_id = ss.scanout_id, +- .width = scanout->width, +- .height = scanout->height +- } +- }; +- vg_send_msg(g, &msg, -1); +- } +-} +- +-static void +-vg_resource_flush(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_flush rf; +- pixman_region16_t flush_region; +- int i; +- +- VUGPU_FILL_CMD(rf); +- virtio_gpu_bswap_32(&rf, sizeof(rf)); +- +- res = virtio_gpu_find_resource(g, rf.resource_id); +- if (!res) { +- g_critical("%s: illegal resource specified %d\n", +- __func__, rf.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- if (rf.r.x > res->width || +- rf.r.y > res->height || +- rf.r.width > res->width || +- rf.r.height > res->height || +- rf.r.x + rf.r.width > res->width || +- rf.r.y + rf.r.height > res->height) { +- g_critical("%s: flush bounds outside resource" +- " bounds for resource %d: %d %d %d %d vs %d %d\n", +- __func__, rf.resource_id, rf.r.x, rf.r.y, +- rf.r.width, rf.r.height, res->width, res->height); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; +- return; +- } +- +- pixman_region_init_rect(&flush_region, +- rf.r.x, rf.r.y, rf.r.width, rf.r.height); +- for (i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++) { +- struct virtio_gpu_scanout *scanout; +- pixman_region16_t region, finalregion; +- pixman_box16_t *extents; +- +- if (!(res->scanout_bitmask & (1 << i))) { +- continue; +- } +- scanout = &g->scanout[i]; +- +- pixman_region_init(&finalregion); +- pixman_region_init_rect(®ion, scanout->x, scanout->y, +- scanout->width, scanout->height); +- +- pixman_region_intersect(&finalregion, &flush_region, ®ion); +- +- extents = pixman_region_extents(&finalregion); +- size_t width = 
extents->x2 - extents->x1; +- size_t height = extents->y2 - extents->y1; +- +- if (vugbm_buffer_can_get_dmabuf_fd(&res->buffer)) { +- VhostUserGpuMsg vmsg = { +- .request = VHOST_USER_GPU_DMABUF_UPDATE, +- .size = sizeof(VhostUserGpuUpdate), +- .payload.update = (VhostUserGpuUpdate) { +- .scanout_id = i, +- .x = extents->x1, +- .y = extents->y1, +- .width = width, +- .height = height, +- } +- }; +- vg_send_msg(g, &vmsg, -1); +- vg_wait_ok(g); +- } else { +- size_t bpp = +- PIXMAN_FORMAT_BPP(pixman_image_get_format(res->image)) / 8; +- size_t size = width * height * bpp; +- +- void *p = g_malloc(VHOST_USER_GPU_HDR_SIZE + +- sizeof(VhostUserGpuUpdate) + size); +- VhostUserGpuMsg *msg = p; +- msg->request = VHOST_USER_GPU_UPDATE; +- msg->size = sizeof(VhostUserGpuUpdate) + size; +- msg->payload.update = (VhostUserGpuUpdate) { +- .scanout_id = i, +- .x = extents->x1, +- .y = extents->y1, +- .width = width, +- .height = height, +- }; +- pixman_image_t *i = +- pixman_image_create_bits(pixman_image_get_format(res->image), +- msg->payload.update.width, +- msg->payload.update.height, +- p + offsetof(VhostUserGpuMsg, +- payload.update.data), +- width * bpp); +- pixman_image_composite(PIXMAN_OP_SRC, +- res->image, NULL, i, +- extents->x1, extents->y1, +- 0, 0, 0, 0, +- width, height); +- pixman_image_unref(i); +- vg_send_msg(g, msg, -1); +- g_free(msg); +- } +- pixman_region_fini(®ion); +- pixman_region_fini(&finalregion); +- } +- pixman_region_fini(&flush_region); +-} +- +-static void +-vg_process_cmd(VuGpu *vg, struct virtio_gpu_ctrl_command *cmd) +-{ +- switch (cmd->cmd_hdr.type) { +- case VIRTIO_GPU_CMD_GET_DISPLAY_INFO: +- vg_get_display_info(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_CREATE_2D: +- vg_resource_create_2d(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_UNREF: +- vg_resource_unref(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_FLUSH: +- vg_resource_flush(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D: +- 
vg_transfer_to_host_2d(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_SET_SCANOUT: +- vg_set_scanout(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING: +- vg_resource_attach_backing(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING: +- vg_resource_detach_backing(vg, cmd); +- break; +- /* case VIRTIO_GPU_CMD_GET_EDID: */ +- /* break */ +- default: +- g_warning("TODO handle ctrl %x\n", cmd->cmd_hdr.type); +- cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; +- break; +- } +- if (!cmd->finished) { +- vg_ctrl_response_nodata(vg, cmd, cmd->error ? cmd->error : +- VIRTIO_GPU_RESP_OK_NODATA); +- } +-} +- +-static void +-vg_handle_ctrl(VuDev *dev, int qidx) +-{ +- VuGpu *vg = container_of(dev, VuGpu, dev.parent); +- VuVirtq *vq = vu_get_queue(dev, qidx); +- struct virtio_gpu_ctrl_command *cmd = NULL; +- size_t len; +- +- for (;;) { +- if (vg->wait_ok != 0) { +- return; +- } +- +- cmd = vu_queue_pop(dev, vq, sizeof(struct virtio_gpu_ctrl_command)); +- if (!cmd) { +- break; +- } +- cmd->vq = vq; +- cmd->error = 0; +- cmd->finished = false; +- +- len = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, +- 0, &cmd->cmd_hdr, sizeof(cmd->cmd_hdr)); +- if (len != sizeof(cmd->cmd_hdr)) { +- g_warning("%s: command size incorrect %zu vs %zu\n", +- __func__, len, sizeof(cmd->cmd_hdr)); +- } +- +- virtio_gpu_ctrl_hdr_bswap(&cmd->cmd_hdr); +- g_debug("%d %s\n", cmd->cmd_hdr.type, +- vg_cmd_to_string(cmd->cmd_hdr.type)); +- +- if (vg->virgl) { +- vg_virgl_process_cmd(vg, cmd); +- } else { +- vg_process_cmd(vg, cmd); +- } +- +- if (!cmd->finished) { +- QTAILQ_INSERT_TAIL(&vg->fenceq, cmd, next); +- vg->inflight++; +- } else { +- g_free(cmd); +- } +- } +-} +- +-static void +-update_cursor_data_simple(VuGpu *g, uint32_t resource_id, gpointer data) +-{ +- struct virtio_gpu_simple_resource *res; +- +- res = virtio_gpu_find_resource(g, resource_id); +- g_return_if_fail(res != NULL); +- g_return_if_fail(pixman_image_get_width(res->image) == 64); +- 
g_return_if_fail(pixman_image_get_height(res->image) == 64); +- g_return_if_fail( +- PIXMAN_FORMAT_BPP(pixman_image_get_format(res->image)) == 32); +- +- memcpy(data, pixman_image_get_data(res->image), 64 * 64 * sizeof(uint32_t)); +-} +- +-static void +-vg_process_cursor_cmd(VuGpu *g, struct virtio_gpu_update_cursor *cursor) +-{ +- bool move = cursor->hdr.type != VIRTIO_GPU_CMD_MOVE_CURSOR; +- +- g_debug("%s move:%d\n", G_STRFUNC, move); +- +- if (move) { +- VhostUserGpuMsg msg = { +- .request = cursor->resource_id ? +- VHOST_USER_GPU_CURSOR_POS : VHOST_USER_GPU_CURSOR_POS_HIDE, +- .size = sizeof(VhostUserGpuCursorPos), +- .payload.cursor_pos = { +- .scanout_id = cursor->pos.scanout_id, +- .x = cursor->pos.x, +- .y = cursor->pos.y, +- } +- }; +- vg_send_msg(g, &msg, -1); +- } else { +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_CURSOR_UPDATE, +- .size = sizeof(VhostUserGpuCursorUpdate), +- .payload.cursor_update = { +- .pos = { +- .scanout_id = cursor->pos.scanout_id, +- .x = cursor->pos.x, +- .y = cursor->pos.y, +- }, +- .hot_x = cursor->hot_x, +- .hot_y = cursor->hot_y, +- } +- }; +- if (g->virgl) { +- vg_virgl_update_cursor_data(g, cursor->resource_id, +- msg.payload.cursor_update.data); +- } else { +- update_cursor_data_simple(g, cursor->resource_id, +- msg.payload.cursor_update.data); +- } +- vg_send_msg(g, &msg, -1); +- } +-} +- +-static void +-vg_handle_cursor(VuDev *dev, int qidx) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- VuVirtq *vq = vu_get_queue(dev, qidx); +- VuVirtqElement *elem; +- size_t len; +- struct virtio_gpu_update_cursor cursor; +- +- for (;;) { +- elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); +- if (!elem) { +- break; +- } +- g_debug("cursor out:%d in:%d\n", elem->out_num, elem->in_num); +- +- len = iov_to_buf(elem->out_sg, elem->out_num, +- 0, &cursor, sizeof(cursor)); +- if (len != sizeof(cursor)) { +- g_warning("%s: cursor size incorrect %zu vs %zu\n", +- __func__, len, sizeof(cursor)); +- } else { +- 
virtio_gpu_bswap_32(&cursor, sizeof(cursor)); +- vg_process_cursor_cmd(g, &cursor); +- } +- vu_queue_push(dev, vq, elem, 0); +- vu_queue_notify(dev, vq); +- g_free(elem); +- } +-} +- +-static void +-vg_panic(VuDev *dev, const char *msg) +-{ +- g_critical("%s\n", msg); +- exit(1); +-} +- +-static void +-vg_queue_set_started(VuDev *dev, int qidx, bool started) +-{ +- VuVirtq *vq = vu_get_queue(dev, qidx); +- +- g_debug("queue started %d:%d\n", qidx, started); +- +- switch (qidx) { +- case 0: +- vu_set_queue_handler(dev, vq, started ? vg_handle_ctrl : NULL); +- break; +- case 1: +- vu_set_queue_handler(dev, vq, started ? vg_handle_cursor : NULL); +- break; +- default: +- break; +- } +-} +- +-static void +-set_gpu_protocol_features(VuGpu *g) +-{ +- uint64_t u64; +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_GET_PROTOCOL_FEATURES +- }; +- +- assert(g->wait_ok == 0); +- vg_send_msg(g, &msg, -1); +- if (!vg_recv_msg(g, msg.request, sizeof(u64), &u64)) { +- return; +- } +- +- msg = (VhostUserGpuMsg) { +- .request = VHOST_USER_GPU_SET_PROTOCOL_FEATURES, +- .size = sizeof(uint64_t), +- .payload.u64 = 0 +- }; +- vg_send_msg(g, &msg, -1); +-} +- +-static int +-vg_process_msg(VuDev *dev, VhostUserMsg *msg, int *do_reply) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- +- switch (msg->request) { +- case VHOST_USER_GPU_SET_SOCKET: { +- g_return_val_if_fail(msg->fd_num == 1, 1); +- g_return_val_if_fail(g->sock_fd == -1, 1); +- g->sock_fd = msg->fds[0]; +- set_gpu_protocol_features(g); +- return 1; +- } +- default: +- return 0; +- } +- +- return 0; +-} +- +-static uint64_t +-vg_get_features(VuDev *dev) +-{ +- uint64_t features = 0; +- +- if (opt_virgl) { +- features |= 1 << VIRTIO_GPU_F_VIRGL; +- } +- +- return features; +-} +- +-static void +-vg_set_features(VuDev *dev, uint64_t features) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- bool virgl = features & (1 << VIRTIO_GPU_F_VIRGL); +- +- if (virgl && !g->virgl_inited) { +- if (!vg_virgl_init(g)) 
{ +- vg_panic(dev, "Failed to initialize virgl"); +- } +- g->virgl_inited = true; +- } +- +- g->virgl = virgl; +-} +- +-static int +-vg_get_config(VuDev *dev, uint8_t *config, uint32_t len) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- +- g_return_val_if_fail(len <= sizeof(struct virtio_gpu_config), -1); +- +- if (opt_virgl) { +- g->virtio_config.num_capsets = vg_virgl_get_num_capsets(); +- } +- +- memcpy(config, &g->virtio_config, len); +- +- return 0; +-} +- +-static int +-vg_set_config(VuDev *dev, const uint8_t *data, +- uint32_t offset, uint32_t size, +- uint32_t flags) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- struct virtio_gpu_config *config = (struct virtio_gpu_config *)data; +- +- if (config->events_clear) { +- g->virtio_config.events_read &= ~config->events_clear; +- } +- +- return 0; +-} +- +-static const VuDevIface vuiface = { +- .set_features = vg_set_features, +- .get_features = vg_get_features, +- .queue_set_started = vg_queue_set_started, +- .process_msg = vg_process_msg, +- .get_config = vg_get_config, +- .set_config = vg_set_config, +-}; +- +-static void +-vg_destroy(VuGpu *g) +-{ +- struct virtio_gpu_simple_resource *res, *tmp; +- +- vug_deinit(&g->dev); +- +- vg_sock_fd_close(g); +- +- QTAILQ_FOREACH_SAFE(res, &g->reslist, next, tmp) { +- vg_resource_destroy(g, res); +- } +- +- vugbm_device_destroy(&g->gdev); +-} +- +-static GOptionEntry entries[] = { +- { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &opt_print_caps, +- "Print capabilities", NULL }, +- { "fd", 'f', 0, G_OPTION_ARG_INT, &opt_fdnum, +- "Use inherited fd socket", "FDNUM" }, +- { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &opt_socket_path, +- "Use UNIX socket path", "PATH" }, +- { "render-node", 'r', 0, G_OPTION_ARG_FILENAME, &opt_render_node, +- "Specify DRM render node", "PATH" }, +- { "virgl", 'v', 0, G_OPTION_ARG_NONE, &opt_virgl, +- "Turn virgl rendering on", NULL }, +- { NULL, } +-}; +- +-int +-main(int argc, char *argv[]) +-{ +- 
GOptionContext *context; +- GError *error = NULL; +- GMainLoop *loop = NULL; +- int fd; +- VuGpu g = { .sock_fd = -1, .drm_rnode_fd = -1 }; +- +- QTAILQ_INIT(&g.reslist); +- QTAILQ_INIT(&g.fenceq); +- +- context = g_option_context_new("QEMU vhost-user-gpu"); +- g_option_context_add_main_entries(context, entries, NULL); +- if (!g_option_context_parse(context, &argc, &argv, &error)) { +- g_printerr("Option parsing failed: %s\n", error->message); +- exit(EXIT_FAILURE); +- } +- g_option_context_free(context); +- +- if (opt_print_caps) { +- g_print("{\n"); +- g_print(" \"type\": \"gpu\",\n"); +- g_print(" \"features\": [\n"); +- g_print(" \"render-node\",\n"); +- g_print(" \"virgl\"\n"); +- g_print(" ]\n"); +- g_print("}\n"); +- exit(EXIT_SUCCESS); +- } +- +- g.drm_rnode_fd = qemu_drm_rendernode_open(opt_render_node); +- if (opt_render_node && g.drm_rnode_fd == -1) { +- g_printerr("Failed to open DRM rendernode.\n"); +- exit(EXIT_FAILURE); +- } +- +- if (g.drm_rnode_fd >= 0) { +- if (!vugbm_device_init(&g.gdev, g.drm_rnode_fd)) { +- g_warning("Failed to init DRM device, using fallback path"); +- } +- } +- +- if ((!!opt_socket_path + (opt_fdnum != -1)) != 1) { +- g_printerr("Please specify either --fd or --socket-path\n"); +- exit(EXIT_FAILURE); +- } +- +- if (opt_socket_path) { +- int lsock = unix_listen(opt_socket_path, &error_fatal); +- if (lsock < 0) { +- g_printerr("Failed to listen on %s.\n", opt_socket_path); +- exit(EXIT_FAILURE); +- } +- fd = accept(lsock, NULL, NULL); +- close(lsock); +- } else { +- fd = opt_fdnum; +- } +- if (fd == -1) { +- g_printerr("Invalid vhost-user socket.\n"); +- exit(EXIT_FAILURE); +- } +- +- if (!vug_init(&g.dev, VHOST_USER_GPU_MAX_QUEUES, fd, vg_panic, &vuiface)) { +- g_printerr("Failed to initialize libvhost-user-glib.\n"); +- exit(EXIT_FAILURE); +- } +- +- loop = g_main_loop_new(NULL, FALSE); +- g_main_loop_run(loop); +- g_main_loop_unref(loop); +- +- vg_destroy(&g); +- if (g.drm_rnode_fd >= 0) { +- close(g.drm_rnode_fd); +- } +- 
+- return 0; +-} +diff --git a/contrib/vhost-user-gpu/vhost-user-gpu.c b/contrib/vhost-user-gpu/vhost-user-gpu.c +new file mode 100644 +index 0000000..b45d201 +--- /dev/null ++++ b/contrib/vhost-user-gpu/vhost-user-gpu.c +@@ -0,0 +1,1191 @@ ++/* ++ * Virtio vhost-user GPU Device ++ * ++ * Copyright Red Hat, Inc. 2013-2018 ++ * ++ * Authors: ++ * Dave Airlie ++ * Gerd Hoffmann ++ * Marc-André Lureau ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++#include "qemu/osdep.h" ++#include "qemu/drm.h" ++#include "qapi/error.h" ++#include "qemu/sockets.h" ++ ++#include ++#include ++ ++#include "vugpu.h" ++#include "hw/virtio/virtio-gpu-bswap.h" ++#include "hw/virtio/virtio-gpu-pixman.h" ++#include "virgl.h" ++#include "vugbm.h" ++ ++enum { ++ VHOST_USER_GPU_MAX_QUEUES = 2, ++}; ++ ++struct virtio_gpu_simple_resource { ++ uint32_t resource_id; ++ uint32_t width; ++ uint32_t height; ++ uint32_t format; ++ struct iovec *iov; ++ unsigned int iov_cnt; ++ uint32_t scanout_bitmask; ++ pixman_image_t *image; ++ struct vugbm_buffer buffer; ++ QTAILQ_ENTRY(virtio_gpu_simple_resource) next; ++}; ++ ++static gboolean opt_print_caps; ++static int opt_fdnum = -1; ++static char *opt_socket_path; ++static char *opt_render_node; ++static gboolean opt_virgl; ++ ++static void vg_handle_ctrl(VuDev *dev, int qidx); ++ ++static const char * ++vg_cmd_to_string(int cmd) ++{ ++#define CMD(cmd) [cmd] = #cmd ++ static const char *vg_cmd_str[] = { ++ CMD(VIRTIO_GPU_UNDEFINED), ++ ++ /* 2d commands */ ++ CMD(VIRTIO_GPU_CMD_GET_DISPLAY_INFO), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_CREATE_2D), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_UNREF), ++ CMD(VIRTIO_GPU_CMD_SET_SCANOUT), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_FLUSH), ++ CMD(VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING), ++ CMD(VIRTIO_GPU_CMD_GET_CAPSET_INFO), ++ CMD(VIRTIO_GPU_CMD_GET_CAPSET), 
++ ++ /* 3d commands */ ++ CMD(VIRTIO_GPU_CMD_CTX_CREATE), ++ CMD(VIRTIO_GPU_CMD_CTX_DESTROY), ++ CMD(VIRTIO_GPU_CMD_CTX_ATTACH_RESOURCE), ++ CMD(VIRTIO_GPU_CMD_CTX_DETACH_RESOURCE), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_CREATE_3D), ++ CMD(VIRTIO_GPU_CMD_TRANSFER_TO_HOST_3D), ++ CMD(VIRTIO_GPU_CMD_TRANSFER_FROM_HOST_3D), ++ CMD(VIRTIO_GPU_CMD_SUBMIT_3D), ++ ++ /* cursor commands */ ++ CMD(VIRTIO_GPU_CMD_UPDATE_CURSOR), ++ CMD(VIRTIO_GPU_CMD_MOVE_CURSOR), ++ }; ++#undef REQ ++ ++ if (cmd >= 0 && cmd < G_N_ELEMENTS(vg_cmd_str)) { ++ return vg_cmd_str[cmd]; ++ } else { ++ return "unknown"; ++ } ++} ++ ++static int ++vg_sock_fd_read(int sock, void *buf, ssize_t buflen) ++{ ++ int ret; ++ ++ do { ++ ret = read(sock, buf, buflen); ++ } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); ++ ++ g_warn_if_fail(ret == buflen); ++ return ret; ++} ++ ++static void ++vg_sock_fd_close(VuGpu *g) ++{ ++ if (g->sock_fd >= 0) { ++ close(g->sock_fd); ++ g->sock_fd = -1; ++ } ++} ++ ++static gboolean ++source_wait_cb(gint fd, GIOCondition condition, gpointer user_data) ++{ ++ VuGpu *g = user_data; ++ ++ if (!vg_recv_msg(g, VHOST_USER_GPU_DMABUF_UPDATE, 0, NULL)) { ++ return G_SOURCE_CONTINUE; ++ } ++ ++ /* resume */ ++ g->wait_ok = 0; ++ vg_handle_ctrl(&g->dev.parent, 0); ++ ++ return G_SOURCE_REMOVE; ++} ++ ++void ++vg_wait_ok(VuGpu *g) ++{ ++ assert(g->wait_ok == 0); ++ g->wait_ok = g_unix_fd_add(g->sock_fd, G_IO_IN | G_IO_HUP, ++ source_wait_cb, g); ++} ++ ++static int ++vg_sock_fd_write(int sock, const void *buf, ssize_t buflen, int fd) ++{ ++ ssize_t ret; ++ struct iovec iov = { ++ .iov_base = (void *)buf, ++ .iov_len = buflen, ++ }; ++ struct msghdr msg = { ++ .msg_iov = &iov, ++ .msg_iovlen = 1, ++ }; ++ union { ++ struct cmsghdr cmsghdr; ++ char control[CMSG_SPACE(sizeof(int))]; ++ } cmsgu; ++ struct cmsghdr *cmsg; ++ ++ if (fd != -1) { ++ msg.msg_control = cmsgu.control; ++ msg.msg_controllen = sizeof(cmsgu.control); ++ ++ cmsg = CMSG_FIRSTHDR(&msg); ++ cmsg->cmsg_len = 
CMSG_LEN(sizeof(int)); ++ cmsg->cmsg_level = SOL_SOCKET; ++ cmsg->cmsg_type = SCM_RIGHTS; ++ ++ *((int *)CMSG_DATA(cmsg)) = fd; ++ } ++ ++ do { ++ ret = sendmsg(sock, &msg, 0); ++ } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); ++ ++ g_warn_if_fail(ret == buflen); ++ return ret; ++} ++ ++void ++vg_send_msg(VuGpu *vg, const VhostUserGpuMsg *msg, int fd) ++{ ++ if (vg_sock_fd_write(vg->sock_fd, msg, ++ VHOST_USER_GPU_HDR_SIZE + msg->size, fd) < 0) { ++ vg_sock_fd_close(vg); ++ } ++} ++ ++bool ++vg_recv_msg(VuGpu *g, uint32_t expect_req, uint32_t expect_size, ++ gpointer payload) ++{ ++ uint32_t req, flags, size; ++ ++ if (vg_sock_fd_read(g->sock_fd, &req, sizeof(req)) < 0 || ++ vg_sock_fd_read(g->sock_fd, &flags, sizeof(flags)) < 0 || ++ vg_sock_fd_read(g->sock_fd, &size, sizeof(size)) < 0) { ++ goto err; ++ } ++ ++ g_return_val_if_fail(req == expect_req, false); ++ g_return_val_if_fail(flags & VHOST_USER_GPU_MSG_FLAG_REPLY, false); ++ g_return_val_if_fail(size == expect_size, false); ++ ++ if (size && vg_sock_fd_read(g->sock_fd, payload, size) != size) { ++ goto err; ++ } ++ ++ return true; ++ ++err: ++ vg_sock_fd_close(g); ++ return false; ++} ++ ++static struct virtio_gpu_simple_resource * ++virtio_gpu_find_resource(VuGpu *g, uint32_t resource_id) ++{ ++ struct virtio_gpu_simple_resource *res; ++ ++ QTAILQ_FOREACH(res, &g->reslist, next) { ++ if (res->resource_id == resource_id) { ++ return res; ++ } ++ } ++ return NULL; ++} ++ ++void ++vg_ctrl_response(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd, ++ struct virtio_gpu_ctrl_hdr *resp, ++ size_t resp_len) ++{ ++ size_t s; ++ ++ if (cmd->cmd_hdr.flags & VIRTIO_GPU_FLAG_FENCE) { ++ resp->flags |= VIRTIO_GPU_FLAG_FENCE; ++ resp->fence_id = cmd->cmd_hdr.fence_id; ++ resp->ctx_id = cmd->cmd_hdr.ctx_id; ++ } ++ virtio_gpu_ctrl_hdr_bswap(resp); ++ s = iov_from_buf(cmd->elem.in_sg, cmd->elem.in_num, 0, resp, resp_len); ++ if (s != resp_len) { ++ g_critical("%s: response size incorrect %zu vs %zu", ++ 
__func__, s, resp_len); ++ } ++ vu_queue_push(&g->dev.parent, cmd->vq, &cmd->elem, s); ++ vu_queue_notify(&g->dev.parent, cmd->vq); ++ cmd->finished = true; ++} ++ ++void ++vg_ctrl_response_nodata(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd, ++ enum virtio_gpu_ctrl_type type) ++{ ++ struct virtio_gpu_ctrl_hdr resp = { ++ .type = type, ++ }; ++ ++ vg_ctrl_response(g, cmd, &resp, sizeof(resp)); ++} ++ ++void ++vg_get_display_info(VuGpu *vg, struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_resp_display_info dpy_info = { {} }; ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_GET_DISPLAY_INFO, ++ .size = 0, ++ }; ++ ++ assert(vg->wait_ok == 0); ++ ++ vg_send_msg(vg, &msg, -1); ++ if (!vg_recv_msg(vg, msg.request, sizeof(dpy_info), &dpy_info)) { ++ return; ++ } ++ ++ vg_ctrl_response(vg, cmd, &dpy_info.hdr, sizeof(dpy_info)); ++} ++ ++static void ++vg_resource_create_2d(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ pixman_format_code_t pformat; ++ struct virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_create_2d c2d; ++ ++ VUGPU_FILL_CMD(c2d); ++ virtio_gpu_bswap_32(&c2d, sizeof(c2d)); ++ ++ if (c2d.resource_id == 0) { ++ g_critical("%s: resource id 0 is not allowed", __func__); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ res = virtio_gpu_find_resource(g, c2d.resource_id); ++ if (res) { ++ g_critical("%s: resource already exists %d", __func__, c2d.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ res = g_new0(struct virtio_gpu_simple_resource, 1); ++ res->width = c2d.width; ++ res->height = c2d.height; ++ res->format = c2d.format; ++ res->resource_id = c2d.resource_id; ++ ++ pformat = virtio_gpu_get_pixman_format(c2d.format); ++ if (!pformat) { ++ g_critical("%s: host couldn't handle guest format %d", ++ __func__, c2d.format); ++ g_free(res); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; ++ return; ++ } ++ 
vugbm_buffer_create(&res->buffer, &g->gdev, c2d.width, c2d.height); ++ res->image = pixman_image_create_bits(pformat, ++ c2d.width, ++ c2d.height, ++ (uint32_t *)res->buffer.mmap, ++ res->buffer.stride); ++ if (!res->image) { ++ g_critical("%s: resource creation failed %d %d %d", ++ __func__, c2d.resource_id, c2d.width, c2d.height); ++ g_free(res); ++ cmd->error = VIRTIO_GPU_RESP_ERR_OUT_OF_MEMORY; ++ return; ++ } ++ ++ QTAILQ_INSERT_HEAD(&g->reslist, res, next); ++} ++ ++static void ++vg_disable_scanout(VuGpu *g, int scanout_id) ++{ ++ struct virtio_gpu_scanout *scanout = &g->scanout[scanout_id]; ++ struct virtio_gpu_simple_resource *res; ++ ++ if (scanout->resource_id == 0) { ++ return; ++ } ++ ++ res = virtio_gpu_find_resource(g, scanout->resource_id); ++ if (res) { ++ res->scanout_bitmask &= ~(1 << scanout_id); ++ } ++ ++ scanout->width = 0; ++ scanout->height = 0; ++ ++ if (g->sock_fd >= 0) { ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_SCANOUT, ++ .size = sizeof(VhostUserGpuScanout), ++ .payload.scanout.scanout_id = scanout_id, ++ }; ++ vg_send_msg(g, &msg, -1); ++ } ++} ++ ++static void ++vg_resource_destroy(VuGpu *g, ++ struct virtio_gpu_simple_resource *res) ++{ ++ int i; ++ ++ if (res->scanout_bitmask) { ++ for (i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++) { ++ if (res->scanout_bitmask & (1 << i)) { ++ vg_disable_scanout(g, i); ++ } ++ } ++ } ++ ++ vugbm_buffer_destroy(&res->buffer); ++ pixman_image_unref(res->image); ++ QTAILQ_REMOVE(&g->reslist, res, next); ++ g_free(res); ++} ++ ++static void ++vg_resource_unref(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_unref unref; ++ ++ VUGPU_FILL_CMD(unref); ++ virtio_gpu_bswap_32(&unref, sizeof(unref)); ++ ++ res = virtio_gpu_find_resource(g, unref.resource_id); ++ if (!res) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, unref.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ 
return; ++ } ++ vg_resource_destroy(g, res); ++} ++ ++int ++vg_create_mapping_iov(VuGpu *g, ++ struct virtio_gpu_resource_attach_backing *ab, ++ struct virtio_gpu_ctrl_command *cmd, ++ struct iovec **iov) ++{ ++ struct virtio_gpu_mem_entry *ents; ++ size_t esize, s; ++ int i; ++ ++ if (ab->nr_entries > 16384) { ++ g_critical("%s: nr_entries is too big (%d > 16384)", ++ __func__, ab->nr_entries); ++ return -1; ++ } ++ ++ esize = sizeof(*ents) * ab->nr_entries; ++ ents = g_malloc(esize); ++ s = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, ++ sizeof(*ab), ents, esize); ++ if (s != esize) { ++ g_critical("%s: command data size incorrect %zu vs %zu", ++ __func__, s, esize); ++ g_free(ents); ++ return -1; ++ } ++ ++ *iov = g_malloc0(sizeof(struct iovec) * ab->nr_entries); ++ for (i = 0; i < ab->nr_entries; i++) { ++ uint64_t len = ents[i].length; ++ (*iov)[i].iov_len = ents[i].length; ++ (*iov)[i].iov_base = vu_gpa_to_va(&g->dev.parent, &len, ents[i].addr); ++ if (!(*iov)[i].iov_base || len != ents[i].length) { ++ g_critical("%s: resource %d element %d", ++ __func__, ab->resource_id, i); ++ g_free(*iov); ++ g_free(ents); ++ *iov = NULL; ++ return -1; ++ } ++ } ++ g_free(ents); ++ return 0; ++} ++ ++static void ++vg_resource_attach_backing(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_attach_backing ab; ++ int ret; ++ ++ VUGPU_FILL_CMD(ab); ++ virtio_gpu_bswap_32(&ab, sizeof(ab)); ++ ++ res = virtio_gpu_find_resource(g, ab.resource_id); ++ if (!res) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, ab.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ ret = vg_create_mapping_iov(g, &ab, cmd, &res->iov); ++ if (ret != 0) { ++ cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; ++ return; ++ } ++ ++ res->iov_cnt = ab.nr_entries; ++} ++ ++static void ++vg_resource_detach_backing(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct 
virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_detach_backing detach; ++ ++ VUGPU_FILL_CMD(detach); ++ virtio_gpu_bswap_32(&detach, sizeof(detach)); ++ ++ res = virtio_gpu_find_resource(g, detach.resource_id); ++ if (!res || !res->iov) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, detach.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ g_free(res->iov); ++ res->iov = NULL; ++ res->iov_cnt = 0; ++} ++ ++static void ++vg_transfer_to_host_2d(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res; ++ int h; ++ uint32_t src_offset, dst_offset, stride; ++ int bpp; ++ pixman_format_code_t format; ++ struct virtio_gpu_transfer_to_host_2d t2d; ++ ++ VUGPU_FILL_CMD(t2d); ++ virtio_gpu_t2d_bswap(&t2d); ++ ++ res = virtio_gpu_find_resource(g, t2d.resource_id); ++ if (!res || !res->iov) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, t2d.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ if (t2d.r.x > res->width || ++ t2d.r.y > res->height || ++ t2d.r.width > res->width || ++ t2d.r.height > res->height || ++ t2d.r.x + t2d.r.width > res->width || ++ t2d.r.y + t2d.r.height > res->height) { ++ g_critical("%s: transfer bounds outside resource" ++ " bounds for resource %d: %d %d %d %d vs %d %d", ++ __func__, t2d.resource_id, t2d.r.x, t2d.r.y, ++ t2d.r.width, t2d.r.height, res->width, res->height); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; ++ return; ++ } ++ ++ format = pixman_image_get_format(res->image); ++ bpp = (PIXMAN_FORMAT_BPP(format) + 7) / 8; ++ stride = pixman_image_get_stride(res->image); ++ ++ if (t2d.offset || t2d.r.x || t2d.r.y || ++ t2d.r.width != pixman_image_get_width(res->image)) { ++ void *img_data = pixman_image_get_data(res->image); ++ for (h = 0; h < t2d.r.height; h++) { ++ src_offset = t2d.offset + stride * h; ++ dst_offset = (t2d.r.y + h) * stride + (t2d.r.x * bpp); ++ 
++ iov_to_buf(res->iov, res->iov_cnt, src_offset, ++ img_data ++ + dst_offset, t2d.r.width * bpp); ++ } ++ } else { ++ iov_to_buf(res->iov, res->iov_cnt, 0, ++ pixman_image_get_data(res->image), ++ pixman_image_get_stride(res->image) ++ * pixman_image_get_height(res->image)); ++ } ++} ++ ++static void ++vg_set_scanout(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res, *ores; ++ struct virtio_gpu_scanout *scanout; ++ struct virtio_gpu_set_scanout ss; ++ int fd; ++ ++ VUGPU_FILL_CMD(ss); ++ virtio_gpu_bswap_32(&ss, sizeof(ss)); ++ ++ if (ss.scanout_id >= VIRTIO_GPU_MAX_SCANOUTS) { ++ g_critical("%s: illegal scanout id specified %d", ++ __func__, ss.scanout_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_SCANOUT_ID; ++ return; ++ } ++ ++ if (ss.resource_id == 0) { ++ vg_disable_scanout(g, ss.scanout_id); ++ return; ++ } ++ ++ /* create a surface for this scanout */ ++ res = virtio_gpu_find_resource(g, ss.resource_id); ++ if (!res) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, ss.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ if (ss.r.x > res->width || ++ ss.r.y > res->height || ++ ss.r.width > res->width || ++ ss.r.height > res->height || ++ ss.r.x + ss.r.width > res->width || ++ ss.r.y + ss.r.height > res->height) { ++ g_critical("%s: illegal scanout %d bounds for" ++ " resource %d, (%d,%d)+%d,%d vs %d %d", ++ __func__, ss.scanout_id, ss.resource_id, ss.r.x, ss.r.y, ++ ss.r.width, ss.r.height, res->width, res->height); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; ++ return; ++ } ++ ++ scanout = &g->scanout[ss.scanout_id]; ++ ++ ores = virtio_gpu_find_resource(g, scanout->resource_id); ++ if (ores) { ++ ores->scanout_bitmask &= ~(1 << ss.scanout_id); ++ } ++ ++ res->scanout_bitmask |= (1 << ss.scanout_id); ++ scanout->resource_id = ss.resource_id; ++ scanout->x = ss.r.x; ++ scanout->y = ss.r.y; ++ scanout->width = ss.r.width; ++ scanout->height = 
ss.r.height; ++ ++ struct vugbm_buffer *buffer = &res->buffer; ++ ++ if (vugbm_buffer_can_get_dmabuf_fd(buffer)) { ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_DMABUF_SCANOUT, ++ .size = sizeof(VhostUserGpuDMABUFScanout), ++ .payload.dmabuf_scanout = (VhostUserGpuDMABUFScanout) { ++ .scanout_id = ss.scanout_id, ++ .x = ss.r.x, ++ .y = ss.r.y, ++ .width = ss.r.width, ++ .height = ss.r.height, ++ .fd_width = buffer->width, ++ .fd_height = buffer->height, ++ .fd_stride = buffer->stride, ++ .fd_drm_fourcc = buffer->format ++ } ++ }; ++ ++ if (vugbm_buffer_get_dmabuf_fd(buffer, &fd)) { ++ vg_send_msg(g, &msg, fd); ++ close(fd); ++ } ++ } else { ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_SCANOUT, ++ .size = sizeof(VhostUserGpuScanout), ++ .payload.scanout = (VhostUserGpuScanout) { ++ .scanout_id = ss.scanout_id, ++ .width = scanout->width, ++ .height = scanout->height ++ } ++ }; ++ vg_send_msg(g, &msg, -1); ++ } ++} ++ ++static void ++vg_resource_flush(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_flush rf; ++ pixman_region16_t flush_region; ++ int i; ++ ++ VUGPU_FILL_CMD(rf); ++ virtio_gpu_bswap_32(&rf, sizeof(rf)); ++ ++ res = virtio_gpu_find_resource(g, rf.resource_id); ++ if (!res) { ++ g_critical("%s: illegal resource specified %d\n", ++ __func__, rf.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ if (rf.r.x > res->width || ++ rf.r.y > res->height || ++ rf.r.width > res->width || ++ rf.r.height > res->height || ++ rf.r.x + rf.r.width > res->width || ++ rf.r.y + rf.r.height > res->height) { ++ g_critical("%s: flush bounds outside resource" ++ " bounds for resource %d: %d %d %d %d vs %d %d\n", ++ __func__, rf.resource_id, rf.r.x, rf.r.y, ++ rf.r.width, rf.r.height, res->width, res->height); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; ++ return; ++ } ++ ++ pixman_region_init_rect(&flush_region, ++ rf.r.x, 
rf.r.y, rf.r.width, rf.r.height); ++ for (i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++) { ++ struct virtio_gpu_scanout *scanout; ++ pixman_region16_t region, finalregion; ++ pixman_box16_t *extents; ++ ++ if (!(res->scanout_bitmask & (1 << i))) { ++ continue; ++ } ++ scanout = &g->scanout[i]; ++ ++ pixman_region_init(&finalregion); ++ pixman_region_init_rect(&region, scanout->x, scanout->y, ++ scanout->width, scanout->height); ++ ++ pixman_region_intersect(&finalregion, &flush_region, &region); ++ ++ extents = pixman_region_extents(&finalregion); ++ size_t width = extents->x2 - extents->x1; ++ size_t height = extents->y2 - extents->y1; ++ ++ if (vugbm_buffer_can_get_dmabuf_fd(&res->buffer)) { ++ VhostUserGpuMsg vmsg = { ++ .request = VHOST_USER_GPU_DMABUF_UPDATE, ++ .size = sizeof(VhostUserGpuUpdate), ++ .payload.update = (VhostUserGpuUpdate) { ++ .scanout_id = i, ++ .x = extents->x1, ++ .y = extents->y1, ++ .width = width, ++ .height = height, ++ } ++ }; ++ vg_send_msg(g, &vmsg, -1); ++ vg_wait_ok(g); ++ } else { ++ size_t bpp = ++ PIXMAN_FORMAT_BPP(pixman_image_get_format(res->image)) / 8; ++ size_t size = width * height * bpp; ++ ++ void *p = g_malloc(VHOST_USER_GPU_HDR_SIZE + ++ sizeof(VhostUserGpuUpdate) + size); ++ VhostUserGpuMsg *msg = p; ++ msg->request = VHOST_USER_GPU_UPDATE; ++ msg->size = sizeof(VhostUserGpuUpdate) + size; ++ msg->payload.update = (VhostUserGpuUpdate) { ++ .scanout_id = i, ++ .x = extents->x1, ++ .y = extents->y1, ++ .width = width, ++ .height = height, ++ }; ++ pixman_image_t *i = ++ pixman_image_create_bits(pixman_image_get_format(res->image), ++ msg->payload.update.width, ++ msg->payload.update.height, ++ p + offsetof(VhostUserGpuMsg, ++ payload.update.data), ++ width * bpp); ++ pixman_image_composite(PIXMAN_OP_SRC, ++ res->image, NULL, i, ++ extents->x1, extents->y1, ++ 0, 0, 0, 0, ++ width, height); ++ pixman_image_unref(i); ++ vg_send_msg(g, msg, -1); ++ g_free(msg); ++ } ++ pixman_region_fini(&region); ++ pixman_region_fini(&finalregion); ++
} ++ pixman_region_fini(&flush_region); ++} ++ ++static void ++vg_process_cmd(VuGpu *vg, struct virtio_gpu_ctrl_command *cmd) ++{ ++ switch (cmd->cmd_hdr.type) { ++ case VIRTIO_GPU_CMD_GET_DISPLAY_INFO: ++ vg_get_display_info(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_CREATE_2D: ++ vg_resource_create_2d(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_UNREF: ++ vg_resource_unref(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_FLUSH: ++ vg_resource_flush(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D: ++ vg_transfer_to_host_2d(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_SET_SCANOUT: ++ vg_set_scanout(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING: ++ vg_resource_attach_backing(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING: ++ vg_resource_detach_backing(vg, cmd); ++ break; ++ /* case VIRTIO_GPU_CMD_GET_EDID: */ ++ /* break */ ++ default: ++ g_warning("TODO handle ctrl %x\n", cmd->cmd_hdr.type); ++ cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; ++ break; ++ } ++ if (!cmd->finished) { ++ vg_ctrl_response_nodata(vg, cmd, cmd->error ? 
cmd->error : ++ VIRTIO_GPU_RESP_OK_NODATA); ++ } ++} ++ ++static void ++vg_handle_ctrl(VuDev *dev, int qidx) ++{ ++ VuGpu *vg = container_of(dev, VuGpu, dev.parent); ++ VuVirtq *vq = vu_get_queue(dev, qidx); ++ struct virtio_gpu_ctrl_command *cmd = NULL; ++ size_t len; ++ ++ for (;;) { ++ if (vg->wait_ok != 0) { ++ return; ++ } ++ ++ cmd = vu_queue_pop(dev, vq, sizeof(struct virtio_gpu_ctrl_command)); ++ if (!cmd) { ++ break; ++ } ++ cmd->vq = vq; ++ cmd->error = 0; ++ cmd->finished = false; ++ ++ len = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, ++ 0, &cmd->cmd_hdr, sizeof(cmd->cmd_hdr)); ++ if (len != sizeof(cmd->cmd_hdr)) { ++ g_warning("%s: command size incorrect %zu vs %zu\n", ++ __func__, len, sizeof(cmd->cmd_hdr)); ++ } ++ ++ virtio_gpu_ctrl_hdr_bswap(&cmd->cmd_hdr); ++ g_debug("%d %s\n", cmd->cmd_hdr.type, ++ vg_cmd_to_string(cmd->cmd_hdr.type)); ++ ++ if (vg->virgl) { ++ vg_virgl_process_cmd(vg, cmd); ++ } else { ++ vg_process_cmd(vg, cmd); ++ } ++ ++ if (!cmd->finished) { ++ QTAILQ_INSERT_TAIL(&vg->fenceq, cmd, next); ++ vg->inflight++; ++ } else { ++ g_free(cmd); ++ } ++ } ++} ++ ++static void ++update_cursor_data_simple(VuGpu *g, uint32_t resource_id, gpointer data) ++{ ++ struct virtio_gpu_simple_resource *res; ++ ++ res = virtio_gpu_find_resource(g, resource_id); ++ g_return_if_fail(res != NULL); ++ g_return_if_fail(pixman_image_get_width(res->image) == 64); ++ g_return_if_fail(pixman_image_get_height(res->image) == 64); ++ g_return_if_fail( ++ PIXMAN_FORMAT_BPP(pixman_image_get_format(res->image)) == 32); ++ ++ memcpy(data, pixman_image_get_data(res->image), 64 * 64 * sizeof(uint32_t)); ++} ++ ++static void ++vg_process_cursor_cmd(VuGpu *g, struct virtio_gpu_update_cursor *cursor) ++{ ++ bool move = cursor->hdr.type != VIRTIO_GPU_CMD_MOVE_CURSOR; ++ ++ g_debug("%s move:%d\n", G_STRFUNC, move); ++ ++ if (move) { ++ VhostUserGpuMsg msg = { ++ .request = cursor->resource_id ? 
++ VHOST_USER_GPU_CURSOR_POS : VHOST_USER_GPU_CURSOR_POS_HIDE, ++ .size = sizeof(VhostUserGpuCursorPos), ++ .payload.cursor_pos = { ++ .scanout_id = cursor->pos.scanout_id, ++ .x = cursor->pos.x, ++ .y = cursor->pos.y, ++ } ++ }; ++ vg_send_msg(g, &msg, -1); ++ } else { ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_CURSOR_UPDATE, ++ .size = sizeof(VhostUserGpuCursorUpdate), ++ .payload.cursor_update = { ++ .pos = { ++ .scanout_id = cursor->pos.scanout_id, ++ .x = cursor->pos.x, ++ .y = cursor->pos.y, ++ }, ++ .hot_x = cursor->hot_x, ++ .hot_y = cursor->hot_y, ++ } ++ }; ++ if (g->virgl) { ++ vg_virgl_update_cursor_data(g, cursor->resource_id, ++ msg.payload.cursor_update.data); ++ } else { ++ update_cursor_data_simple(g, cursor->resource_id, ++ msg.payload.cursor_update.data); ++ } ++ vg_send_msg(g, &msg, -1); ++ } ++} ++ ++static void ++vg_handle_cursor(VuDev *dev, int qidx) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ VuVirtq *vq = vu_get_queue(dev, qidx); ++ VuVirtqElement *elem; ++ size_t len; ++ struct virtio_gpu_update_cursor cursor; ++ ++ for (;;) { ++ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); ++ if (!elem) { ++ break; ++ } ++ g_debug("cursor out:%d in:%d\n", elem->out_num, elem->in_num); ++ ++ len = iov_to_buf(elem->out_sg, elem->out_num, ++ 0, &cursor, sizeof(cursor)); ++ if (len != sizeof(cursor)) { ++ g_warning("%s: cursor size incorrect %zu vs %zu\n", ++ __func__, len, sizeof(cursor)); ++ } else { ++ virtio_gpu_bswap_32(&cursor, sizeof(cursor)); ++ vg_process_cursor_cmd(g, &cursor); ++ } ++ vu_queue_push(dev, vq, elem, 0); ++ vu_queue_notify(dev, vq); ++ g_free(elem); ++ } ++} ++ ++static void ++vg_panic(VuDev *dev, const char *msg) ++{ ++ g_critical("%s\n", msg); ++ exit(1); ++} ++ ++static void ++vg_queue_set_started(VuDev *dev, int qidx, bool started) ++{ ++ VuVirtq *vq = vu_get_queue(dev, qidx); ++ ++ g_debug("queue started %d:%d\n", qidx, started); ++ ++ switch (qidx) { ++ case 0: ++ vu_set_queue_handler(dev, vq, 
started ? vg_handle_ctrl : NULL); ++ break; ++ case 1: ++ vu_set_queue_handler(dev, vq, started ? vg_handle_cursor : NULL); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++set_gpu_protocol_features(VuGpu *g) ++{ ++ uint64_t u64; ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_GET_PROTOCOL_FEATURES ++ }; ++ ++ assert(g->wait_ok == 0); ++ vg_send_msg(g, &msg, -1); ++ if (!vg_recv_msg(g, msg.request, sizeof(u64), &u64)) { ++ return; ++ } ++ ++ msg = (VhostUserGpuMsg) { ++ .request = VHOST_USER_GPU_SET_PROTOCOL_FEATURES, ++ .size = sizeof(uint64_t), ++ .payload.u64 = 0 ++ }; ++ vg_send_msg(g, &msg, -1); ++} ++ ++static int ++vg_process_msg(VuDev *dev, VhostUserMsg *msg, int *do_reply) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ ++ switch (msg->request) { ++ case VHOST_USER_GPU_SET_SOCKET: { ++ g_return_val_if_fail(msg->fd_num == 1, 1); ++ g_return_val_if_fail(g->sock_fd == -1, 1); ++ g->sock_fd = msg->fds[0]; ++ set_gpu_protocol_features(g); ++ return 1; ++ } ++ default: ++ return 0; ++ } ++ ++ return 0; ++} ++ ++static uint64_t ++vg_get_features(VuDev *dev) ++{ ++ uint64_t features = 0; ++ ++ if (opt_virgl) { ++ features |= 1 << VIRTIO_GPU_F_VIRGL; ++ } ++ ++ return features; ++} ++ ++static void ++vg_set_features(VuDev *dev, uint64_t features) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ bool virgl = features & (1 << VIRTIO_GPU_F_VIRGL); ++ ++ if (virgl && !g->virgl_inited) { ++ if (!vg_virgl_init(g)) { ++ vg_panic(dev, "Failed to initialize virgl"); ++ } ++ g->virgl_inited = true; ++ } ++ ++ g->virgl = virgl; ++} ++ ++static int ++vg_get_config(VuDev *dev, uint8_t *config, uint32_t len) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ ++ g_return_val_if_fail(len <= sizeof(struct virtio_gpu_config), -1); ++ ++ if (opt_virgl) { ++ g->virtio_config.num_capsets = vg_virgl_get_num_capsets(); ++ } ++ ++ memcpy(config, &g->virtio_config, len); ++ ++ return 0; ++} ++ ++static int ++vg_set_config(VuDev *dev, const 
uint8_t *data, ++ uint32_t offset, uint32_t size, ++ uint32_t flags) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ struct virtio_gpu_config *config = (struct virtio_gpu_config *)data; ++ ++ if (config->events_clear) { ++ g->virtio_config.events_read &= ~config->events_clear; ++ } ++ ++ return 0; ++} ++ ++static const VuDevIface vuiface = { ++ .set_features = vg_set_features, ++ .get_features = vg_get_features, ++ .queue_set_started = vg_queue_set_started, ++ .process_msg = vg_process_msg, ++ .get_config = vg_get_config, ++ .set_config = vg_set_config, ++}; ++ ++static void ++vg_destroy(VuGpu *g) ++{ ++ struct virtio_gpu_simple_resource *res, *tmp; ++ ++ vug_deinit(&g->dev); ++ ++ vg_sock_fd_close(g); ++ ++ QTAILQ_FOREACH_SAFE(res, &g->reslist, next, tmp) { ++ vg_resource_destroy(g, res); ++ } ++ ++ vugbm_device_destroy(&g->gdev); ++} ++ ++static GOptionEntry entries[] = { ++ { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &opt_print_caps, ++ "Print capabilities", NULL }, ++ { "fd", 'f', 0, G_OPTION_ARG_INT, &opt_fdnum, ++ "Use inherited fd socket", "FDNUM" }, ++ { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &opt_socket_path, ++ "Use UNIX socket path", "PATH" }, ++ { "render-node", 'r', 0, G_OPTION_ARG_FILENAME, &opt_render_node, ++ "Specify DRM render node", "PATH" }, ++ { "virgl", 'v', 0, G_OPTION_ARG_NONE, &opt_virgl, ++ "Turn virgl rendering on", NULL }, ++ { NULL, } ++}; ++ ++int ++main(int argc, char *argv[]) ++{ ++ GOptionContext *context; ++ GError *error = NULL; ++ GMainLoop *loop = NULL; ++ int fd; ++ VuGpu g = { .sock_fd = -1, .drm_rnode_fd = -1 }; ++ ++ QTAILQ_INIT(&g.reslist); ++ QTAILQ_INIT(&g.fenceq); ++ ++ context = g_option_context_new("QEMU vhost-user-gpu"); ++ g_option_context_add_main_entries(context, entries, NULL); ++ if (!g_option_context_parse(context, &argc, &argv, &error)) { ++ g_printerr("Option parsing failed: %s\n", error->message); ++ exit(EXIT_FAILURE); ++ } ++ g_option_context_free(context); ++ ++ if (opt_print_caps) 
{ ++ g_print("{\n"); ++ g_print(" \"type\": \"gpu\",\n"); ++ g_print(" \"features\": [\n"); ++ g_print(" \"render-node\",\n"); ++ g_print(" \"virgl\"\n"); ++ g_print(" ]\n"); ++ g_print("}\n"); ++ exit(EXIT_SUCCESS); ++ } ++ ++ g.drm_rnode_fd = qemu_drm_rendernode_open(opt_render_node); ++ if (opt_render_node && g.drm_rnode_fd == -1) { ++ g_printerr("Failed to open DRM rendernode.\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (g.drm_rnode_fd >= 0) { ++ if (!vugbm_device_init(&g.gdev, g.drm_rnode_fd)) { ++ g_warning("Failed to init DRM device, using fallback path"); ++ } ++ } ++ ++ if ((!!opt_socket_path + (opt_fdnum != -1)) != 1) { ++ g_printerr("Please specify either --fd or --socket-path\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (opt_socket_path) { ++ int lsock = unix_listen(opt_socket_path, &error_fatal); ++ if (lsock < 0) { ++ g_printerr("Failed to listen on %s.\n", opt_socket_path); ++ exit(EXIT_FAILURE); ++ } ++ fd = accept(lsock, NULL, NULL); ++ close(lsock); ++ } else { ++ fd = opt_fdnum; ++ } ++ if (fd == -1) { ++ g_printerr("Invalid vhost-user socket.\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (!vug_init(&g.dev, VHOST_USER_GPU_MAX_QUEUES, fd, vg_panic, &vuiface)) { ++ g_printerr("Failed to initialize libvhost-user-glib.\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ loop = g_main_loop_new(NULL, FALSE); ++ g_main_loop_run(loop); ++ g_main_loop_unref(loop); ++ ++ vg_destroy(&g); ++ if (g.drm_rnode_fd >= 0) { ++ close(g.drm_rnode_fd); ++ } ++ ++ return 0; ++} +-- +1.8.3.1 + diff --git a/kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch b/kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch new file mode 100755 index 0000000..32d5377 --- /dev/null +++ b/kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch @@ -0,0 +1,60 @@ +From 6f1ebcfdb92d12ef2caae0b63a3a380265cba1fa Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:46 -0400 +Subject: [PATCH 8/9] cadence_gem: switch to use qemu_receive_packet() for + loopback 
+MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-9-jmaloy@redhat.com> +Patchwork-id: 101793 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 8/9] cadence_gem: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Alexander Bulekov + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Alexander Bulekov +Signed-off-by: Jason Wang + +(cherry picked from commit e73adfbeec9d4e008630c814759052ed945c3fed) +Conflict: upstream commit 24d62fd5028e ("net: cadence_gem: Move tx/rx +packet buffert to CadenceGEMState") is missing in this version, so +we stick to using the original stack variable tx_packet in the calls. + +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/cadence_gem.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c +index b8be73dc55..be7c91123b 100644 +--- a/hw/net/cadence_gem.c ++++ b/hw/net/cadence_gem.c +@@ -1225,8 +1225,8 @@ static void gem_transmit(CadenceGEMState *s) + /* Send the packet somewhere */ + if (s->phy_loop || (s->regs[GEM_NWCTRL] & + GEM_NWCTRL_LOCALLOOP)) { +- gem_receive(qemu_get_queue(s->nic), tx_packet, +- total_bytes); ++ qemu_receive_packet(qemu_get_queue(s->nic), tx_packet, ++ total_bytes); + } else { + qemu_send_packet(qemu_get_queue(s->nic), tx_packet, + total_bytes); +-- +2.27.0 + diff --git a/kvm-compat-disable-edid-for-virtio-gpu-ccw.patch b/kvm-compat-disable-edid-for-virtio-gpu-ccw.patch new file mode 100755 index 0000000..e000534 --- /dev/null +++ b/kvm-compat-disable-edid-for-virtio-gpu-ccw.patch @@ -0,0 +1,50 @@ +From 8f9f4d8d52ebb7878543ac0b84cc372477041e33 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Wed, 1 Apr 2020 16:13:50 -0400 +Subject: [PATCH 2/2] compat: disable 'edid' for virtio-gpu-ccw + +RH-Author: Cornelia Huck +Message-id: <20200401161350.20462-1-cohuck@redhat.com> +Patchwork-id: 94523 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2] compat: disable 'edid' for virtio-gpu-ccw +Bugzilla: 1816793 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Markus Armbruster +RH-Acked-by: Dr. David Alan Gilbert + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816793 +Branch: rhel-av-8.2.1 +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=27629804 +Upstream: downstream only +Tested: verified that for a virtio-gpu-ccw device 'edid' is false with + a s390-ccw-virtio-rhel7.6.0 machine and true with a + s390-ccw-virtio-rhel8.2.0 (s390x does not have the 8.0 or 8.1 + machine types) + +hw_compat_rhel_8_0 copied the original upstream version of +disabling 'edid' for virtio-gpu-pci only (not following later +changes). 
Switch it to virtio-gpu-device, following upstream +02501fc39381 ("compat: disable edid on correct virtio-gpu device"). + +Signed-off-by: Cornelia Huck +Signed-off-by: Jon Maloy +--- + hw/core/machine.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/core/machine.c b/hw/core/machine.c +index e0e0eec8bf..5a025d1af2 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -72,7 +72,7 @@ GlobalProperty hw_compat_rhel_8_0[] = { + /* hw_compat_rhel_8_0 from hw_compat_4_0 */ + { "virtio-vga", "edid", "false" }, + /* hw_compat_rhel_8_0 from hw_compat_4_0 */ +- { "virtio-gpu-pci", "edid", "false" }, ++ { "virtio-gpu-device", "edid", "false" }, + /* hw_compat_rhel_8_0 from hw_compat_4_0 */ + { "virtio-device", "use-started", "false" }, + /* hw_compat_rhel_8_0 from hw_compat_3_1 - that was added in 4.1 */ +-- +2.18.2 + diff --git a/kvm-config-enable-VFIO_CCW.patch b/kvm-config-enable-VFIO_CCW.patch new file mode 100755 index 0000000..44af9cf --- /dev/null +++ b/kvm-config-enable-VFIO_CCW.patch @@ -0,0 +1,39 @@ +From f3e80771c921560a58c30020781fa01a54be8eb0 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:43 -0400 +Subject: [PATCH 09/12] config: enable VFIO_CCW + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-10-cohuck@redhat.com> +Patchwork-id: 97699 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 9/9] config: enable VFIO_CCW +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +Enable vfio-ccw in RHEL builds. + +Upstream: n/a + +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. 
de Paula +--- + default-configs/s390x-rh-devices.mak | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/default-configs/s390x-rh-devices.mak b/default-configs/s390x-rh-devices.mak +index c3c73fe752..08a15f3e01 100644 +--- a/default-configs/s390x-rh-devices.mak ++++ b/default-configs/s390x-rh-devices.mak +@@ -9,6 +9,7 @@ CONFIG_SCSI=y + CONFIG_TERMINAL3270=y + CONFIG_VFIO=y + CONFIG_VFIO_AP=y ++CONFIG_VFIO_CCW=y + CONFIG_VFIO_PCI=y + CONFIG_VHOST_USER=y + CONFIG_VIRTIO_CCW=y +-- +2.27.0 + diff --git a/kvm-contrib-libvhost-user-Protect-slave-fd-with-mutex.patch b/kvm-contrib-libvhost-user-Protect-slave-fd-with-mutex.patch new file mode 100755 index 0000000..4212f1c --- /dev/null +++ b/kvm-contrib-libvhost-user-Protect-slave-fd-with-mutex.patch @@ -0,0 +1,134 @@ +From 548de8acbf0137b6e49a14b63682badfff037d23 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:44 +0100 +Subject: [PATCH 073/116] contrib/libvhost-user: Protect slave fd with mutex +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-70-dgilbert@redhat.com> +Patchwork-id: 93523 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 069/112] contrib/libvhost-user: Protect slave fd with mutex +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +In future patches we'll be performing commands on the slave-fd driven +by commands on queues, since those queues will be driven by individual +threads we need to make sure they don't attempt to use the slave-fd +for multiple commands in parallel. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit c25c02b9e6a196be87a818f459c426556b24770d) +Signed-off-by: Miroslav Rezanina +--- + contrib/libvhost-user/libvhost-user.c | 24 ++++++++++++++++++++---- + contrib/libvhost-user/libvhost-user.h | 3 +++ + 2 files changed, 23 insertions(+), 4 deletions(-) + +diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c +index ec27b78..63e4106 100644 +--- a/contrib/libvhost-user/libvhost-user.c ++++ b/contrib/libvhost-user/libvhost-user.c +@@ -392,26 +392,37 @@ vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) + return vu_message_write(dev, conn_fd, vmsg); + } + ++/* ++ * Processes a reply on the slave channel. ++ * Entered with slave_mutex held and releases it before exit. ++ * Returns true on success. ++ */ + static bool + vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg) + { + VhostUserMsg msg_reply; ++ bool result = false; + + if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) { +- return true; ++ result = true; ++ goto out; + } + + if (!vu_message_read(dev, dev->slave_fd, &msg_reply)) { +- return false; ++ goto out; + } + + if (msg_reply.request != vmsg->request) { + DPRINT("Received unexpected msg type. Expected %d received %d", + vmsg->request, msg_reply.request); +- return false; ++ goto out; + } + +- return msg_reply.payload.u64 == 0; ++ result = msg_reply.payload.u64 == 0; ++ ++out: ++ pthread_mutex_unlock(&dev->slave_mutex); ++ return result; + } + + /* Kick the log_call_fd if required. 
*/ +@@ -1105,10 +1116,13 @@ bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd, + return false; + } + ++ pthread_mutex_lock(&dev->slave_mutex); + if (!vu_message_write(dev, dev->slave_fd, &vmsg)) { ++ pthread_mutex_unlock(&dev->slave_mutex); + return false; + } + ++ /* Also unlocks the slave_mutex */ + return vu_process_message_reply(dev, &vmsg); + } + +@@ -1628,6 +1642,7 @@ vu_deinit(VuDev *dev) + close(dev->slave_fd); + dev->slave_fd = -1; + } ++ pthread_mutex_destroy(&dev->slave_mutex); + + if (dev->sock != -1) { + close(dev->sock); +@@ -1663,6 +1678,7 @@ vu_init(VuDev *dev, + dev->remove_watch = remove_watch; + dev->iface = iface; + dev->log_call_fd = -1; ++ pthread_mutex_init(&dev->slave_mutex, NULL); + dev->slave_fd = -1; + dev->max_queues = max_queues; + +diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h +index 46b6007..1844b6f 100644 +--- a/contrib/libvhost-user/libvhost-user.h ++++ b/contrib/libvhost-user/libvhost-user.h +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include "standard-headers/linux/virtio_ring.h" + + /* Based on qemu/hw/virtio/vhost-user.c */ +@@ -355,6 +356,8 @@ struct VuDev { + VuVirtq *vq; + VuDevInflightInfo inflight_info; + int log_call_fd; ++ /* Must be held while using slave_fd */ ++ pthread_mutex_t slave_mutex; + int slave_fd; + uint64_t log_size; + uint8_t *log_table; +-- +1.8.3.1 + diff --git a/kvm-crypto.c-cleanup-created-file-when-block_crypto_co_c.patch b/kvm-crypto.c-cleanup-created-file-when-block_crypto_co_c.patch new file mode 100755 index 0000000..891b866 --- /dev/null +++ b/kvm-crypto.c-cleanup-created-file-when-block_crypto_co_c.patch @@ -0,0 +1,98 @@ +From 043decff5812c1f46ed44dd0f82099e3b8bb6a28 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Sun, 31 May 2020 16:40:35 +0100 +Subject: [PATCH 7/7] crypto.c: cleanup created file when + block_crypto_co_create_opts_luks fails + +RH-Author: Maxim Levitsky +Message-id: 
<20200531164035.34188-4-mlevitsk@redhat.com> +Patchwork-id: 97060 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 3/3] crypto.c: cleanup created file when block_crypto_co_create_opts_luks fails +Bugzilla: 1827630 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: John Snow +RH-Acked-by: Eric Blake + +From: Daniel Henrique Barboza + +When using a non-UTF8 secret to create a volume using qemu-img, the +following error happens: + +$ qemu-img create -f luks --object secret,id=vol_1_encrypt0,file=vol_resize_pool.vol_1.secret.qzVQrI -o key-secret=vol_1_encrypt0 /var/tmp/pool_target/vol_1 10240K + +Formatting '/var/tmp/pool_target/vol_1', fmt=luks size=10485760 key-secret=vol_1_encrypt0 +qemu-img: /var/tmp/pool_target/vol_1: Data from secret vol_1_encrypt0 is not valid UTF-8 + +However, the created file '/var/tmp/pool_target/vol_1' is left behind in the +file system after the failure. This behavior can be observed when creating +the volume using Libvirt, via 'virsh vol-create', and then getting "volume +target path already exist" errors when trying to re-create the volume. + +The volume file is created inside block_crypto_co_create_opts_luks(), in +block/crypto.c. If the bdrv_create_file() call is successful but any +succeeding step fails*, the existing 'fail' label does not take into +account the created file, leaving it behind. + +This patch changes block_crypto_co_create_opts_luks() to delete +'filename' in case of failure. A failure in this point means that +the volume is now truncated/corrupted, so even if 'filename' was an +existing volume before calling qemu-img, it is now unusable. Deleting +the file it is not much worse than leaving it in the filesystem in +this scenario, and we don't have to deal with checking the file +pre-existence in the code. + +* in our case, block_crypto_co_create_generic calls qcrypto_block_create, +which calls qcrypto_block_luks_create, and this function fails when +calling qcrypto_secret_lookup_as_utf8. 
+ +Reported-by: Srikanth Aithal +Suggested-by: Kevin Wolf +Signed-off-by: Daniel Henrique Barboza +Message-Id: <20200130213907.2830642-4-danielhb413@gmail.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 1bba30da24e1124ceeb0693c81382a0d77e20ca5) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block/crypto.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/block/crypto.c b/block/crypto.c +index 970d463..5e3b15c 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -30,6 +30,7 @@ + #include "qapi/error.h" + #include "qemu/module.h" + #include "qemu/option.h" ++#include "qemu/cutils.h" + #include "crypto.h" + + typedef struct BlockCrypto BlockCrypto; +@@ -597,6 +598,23 @@ static int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv, + + ret = 0; + fail: ++ /* ++ * If an error occurred, delete 'filename'. Even if the file existed ++ * beforehand, it has been truncated and corrupted in the process. ++ */ ++ if (ret && bs) { ++ Error *local_delete_err = NULL; ++ int r_del = bdrv_co_delete_file(bs, &local_delete_err); ++ /* ++ * ENOTSUP will happen if the block driver doesn't support ++ * the 'bdrv_co_delete_file' interface. This is a predictable ++ * scenario and shouldn't be reported back to the user. 
++ */ ++ if ((r_del < 0) && (r_del != -ENOTSUP)) { ++ error_report_err(local_delete_err); ++ } ++ } ++ + bdrv_unref(bs); + qapi_free_QCryptoBlockCreateOptions(create_opts); + qobject_unref(cryptoopts); +-- +1.8.3.1 + diff --git a/kvm-docs-arm-cpu-features-Make-kvm-no-adjvtime-comment-c.patch b/kvm-docs-arm-cpu-features-Make-kvm-no-adjvtime-comment-c.patch new file mode 100755 index 0000000..a6177c6 --- /dev/null +++ b/kvm-docs-arm-cpu-features-Make-kvm-no-adjvtime-comment-c.patch @@ -0,0 +1,56 @@ +From f01178897c8f5ff98692a22059dd65e35677eaa3 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Mon, 10 Feb 2020 17:33:58 +0000 +Subject: [PATCH 18/18] docs/arm-cpu-features: Make kvm-no-adjvtime comment + clearer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200210173358.16896-3-drjones@redhat.com> +Patchwork-id: 93772 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] docs/arm-cpu-features: Make kvm-no-adjvtime comment clearer +Bugzilla: 1801320 +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan +RH-Acked-by: Philippe Mathieu-Daudé + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1801320 + +Author: Philippe Mathieu-Daudé +Date: Fri, 07 Feb 2020 14:04:28 +0000 + + docs/arm-cpu-features: Make kvm-no-adjvtime comment clearer + + The bold text sounds like 'knock knock'. Only bolding the + second 'not' makes it easier to read. + + Fixes: dea101a1ae + Signed-off-by: Philippe Mathieu-Daudé + Reviewed-by: Andrew Jones + Message-id: 20200206225148.23923-1-philmd@redhat.com + Signed-off-by: Peter Maydell + +(cherry picked from commit fa3236a970b6ea5be3fa3ad258f1a75920ca1ebb) +Signed-off-by: Danilo C. L. 
de Paula +--- + docs/arm-cpu-features.rst | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/docs/arm-cpu-features.rst b/docs/arm-cpu-features.rst +index 45d1eb6..48d5054 100644 +--- a/docs/arm-cpu-features.rst ++++ b/docs/arm-cpu-features.rst +@@ -185,7 +185,7 @@ the list of KVM VCPU features and their descriptions. + + kvm-no-adjvtime By default kvm-no-adjvtime is disabled. This + means that by default the virtual time +- adjustment is enabled (vtime is *not not* ++ adjustment is enabled (vtime is not *not* + adjusted). + + When virtual time adjustment is enabled each +-- +1.8.3.1 + diff --git a/kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch b/kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch new file mode 100755 index 0000000..77e99eb --- /dev/null +++ b/kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch @@ -0,0 +1,53 @@ +From a6f0bef82cdd84844a06dac1e6d279d95824d827 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:41 -0400 +Subject: [PATCH 3/9] dp8393x: switch to use qemu_receive_packet() for loopback + packet +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-4-jmaloy@redhat.com> +Patchwork-id: 101789 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 3/9] dp8393x: switch to use qemu_receive_packet() for loopback packet +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé + +(cherry picked from commit 331d2ac9ea307c990dc86e6493e8f0c48d14bb33) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/dp8393x.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c +index 3d991af163..6d55b5de64 100644 +--- a/hw/net/dp8393x.c ++++ b/hw/net/dp8393x.c +@@ -482,7 +482,7 @@ static void dp8393x_do_transmit_packets(dp8393xState *s) + s->regs[SONIC_TCR] |= SONIC_TCR_CRSL; + if (nc->info->can_receive(nc)) { + s->loopback_packet = 1; +- nc->info->receive(nc, s->tx_buffer, tx_len); ++ qemu_receive_packet(nc, s->tx_buffer, tx_len); + } + } else { + /* Transmit packet */ +-- +2.27.0 + diff --git a/kvm-e1000-fail-early-for-evil-descriptor.patch b/kvm-e1000-fail-early-for-evil-descriptor.patch new file mode 100755 index 0000000..e599b7c --- /dev/null +++ b/kvm-e1000-fail-early-for-evil-descriptor.patch @@ -0,0 +1,65 @@ +From 7bd3000cf22a91e6bc6afc1e7adbf0ae1b731104 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 13 Apr 2021 22:45:17 -0400 +Subject: [PATCH 2/5] e1000: fail early for evil descriptor + +RH-Author: Jon Maloy +Message-id: <20210413224517.3841507-2-jmaloy@redhat.com> +Patchwork-id: 101473 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] e1000: fail early for evil descriptor +Bugzilla: 1930092 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Stefan Hajnoczi + +From: Jason Wang + +During procss_tx_desc(), driver can try to chain data descriptor with +legacy descriptor, when will lead underflow for the following +calculation in process_tx_desc() for bytes: + + if (tp->size + bytes > msh) + bytes = msh - tp->size; + +This will lead a infinite loop. So check and fail early if tp->size if +greater or equal to msh. + +Reported-by: Alexander Bulekov +Reported-by: Cheolwoo Myung +Reported-by: Ruhr-University Bochum +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Signed-off-by: Jason Wang + +(cherry picked from commit 3de46e6fc489c52c9431a8a832ad8170a7569bd8) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/e1000.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/hw/net/e1000.c b/hw/net/e1000.c +index fc73fdd6fa..fe56bccd52 100644 +--- a/hw/net/e1000.c ++++ b/hw/net/e1000.c +@@ -671,6 +671,9 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) + msh = tp->tso_props.hdr_len + tp->tso_props.mss; + do { + bytes = split_size; ++ if (tp->size >= msh) { ++ goto eop; ++ } + if (tp->size + bytes > msh) + bytes = msh - tp->size; + +@@ -696,6 +699,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) + tp->size += split_size; + } + ++eop: + if (!(txd_lower & E1000_TXD_CMD_EOP)) + return; + if (!(tp->cptse && tp->size < tp->tso_props.hdr_len)) { +-- +2.27.0 + diff --git a/kvm-e1000-fix-tx-re-entrancy-problem.patch b/kvm-e1000-fix-tx-re-entrancy-problem.patch new file mode 100755 index 0000000..55aae5e --- /dev/null +++ b/kvm-e1000-fix-tx-re-entrancy-problem.patch @@ -0,0 +1,71 @@ +From fc0bca7bd2685b8f8e3c37f19ce74967870ef952 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 21 Oct 2021 12:10:47 -0400 +Subject: [PATCH 2/2] e1000: fix tx re-entrancy problem +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +RH-MergeRequest: 73: e1000: fix tx re-entrancy problem +RH-Commit: [1/1] 3088ea275ddcee1ba0d47f7cff195af3e256f15f (jmaloy/qemu-kvm) +RH-Bugzilla: 2025011 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laurent Vivier + +The fact that the MMIO handler is not re-entrant causes an infinite +loop under certain conditions: + +Guest write to TDT -> Loopback -> RX (DMA to TDT) -> TX + +We now eliminate the effect of this problem locally in e1000, by adding +a boolean in struct E1000State indicating when the TX side is busy. This +will cause any entering new call to return early instead of interfering +with the ongoing work, and eliminates any risk of looping. + +This is intended to address CVE-2021-20257. 
+ +Signed-off-by: Jon Maloy +Signed-off-by: Jason Wang +(cherry picked from commit 25ddb946e6301f42cff3094ea1c25fb78813e7e9) +Signed-off-by: Jon Maloy +--- + hw/net/e1000.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/hw/net/e1000.c b/hw/net/e1000.c +index 8680b7d46b..1963a5b243 100644 +--- a/hw/net/e1000.c ++++ b/hw/net/e1000.c +@@ -105,6 +105,7 @@ typedef struct E1000State_st { + e1000x_txd_props props; + e1000x_txd_props tso_props; + uint16_t tso_frames; ++ bool busy; + } tx; + + struct { +@@ -749,6 +750,11 @@ start_xmit(E1000State *s) + return; + } + ++ if (s->tx.busy) { ++ return; ++ } ++ s->tx.busy = true; ++ + while (s->mac_reg[TDH] != s->mac_reg[TDT]) { + base = tx_desc_base(s) + + sizeof(struct e1000_tx_desc) * s->mac_reg[TDH]; +@@ -775,6 +781,7 @@ start_xmit(E1000State *s) + break; + } + } ++ s->tx.busy = false; + set_ics(s, 0, cause); + } + +-- +2.27.0 + diff --git a/kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch b/kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch new file mode 100755 index 0000000..05ff372 --- /dev/null +++ b/kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch @@ -0,0 +1,52 @@ +From 128b97f6049144af3c1a41ceb8e8583419edcd69 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:40 -0400 +Subject: [PATCH 2/9] e1000: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-3-jmaloy@redhat.com> +Patchwork-id: 101784 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 2/9] e1000: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. 
+ +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 1caff0340f49c93d535c6558a5138d20d475315c) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/e1000.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/e1000.c b/hw/net/e1000.c +index fe56bccd52..8680b7d46b 100644 +--- a/hw/net/e1000.c ++++ b/hw/net/e1000.c +@@ -547,7 +547,7 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size) + + NetClientState *nc = qemu_get_queue(s->nic); + if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { +- nc->info->receive(nc, buf, size); ++ qemu_receive_packet(nc, buf, size); + } else { + qemu_send_packet(nc, buf, size); + } +-- +2.27.0 + diff --git a/kvm-enable-ramfb.patch b/kvm-enable-ramfb.patch new file mode 100755 index 0000000..fa2fe11 --- /dev/null +++ b/kvm-enable-ramfb.patch @@ -0,0 +1,72 @@ +From 441128e2f13a56d4949b70971edd2f6902772959 Mon Sep 17 00:00:00 2001 +From: Gerd Hoffmann +Date: Wed, 3 Jun 2020 15:15:56 +0100 +Subject: [PATCH 01/17] enable ramfb + +RH-Author: Gerd Hoffmann +Message-id: <20200603151556.1195-2-kraxel@redhat.com> +Patchwork-id: 97097 +O-Subject: [RHEL-AV-8.2.0.z qemu-kvm PATCH 1/1] enable ramfb +Bugzilla: 1841068 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Kevin Wolf +RH-Acked-by: Stefan Hajnoczi + +--- + hw/vfio/pci.c | 5 ----- + hw/display/Makefile.objs | 5 ++--- + 2 files changed, 2 insertions(+), 8 deletions(-) + +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/display/Makefile.objs | 5 ++--- + hw/vfio/pci.c | 5 ----- + 2 files changed, 2 insertions(+), 8 deletions(-) + +diff --git a/hw/display/Makefile.objs b/hw/display/Makefile.objs +index 3d0cda1..f2182e3 100644 +--- a/hw/display/Makefile.objs ++++ b/hw/display/Makefile.objs +@@ -1,9 +1,8 @@ + common-obj-$(CONFIG_DDC) += i2c-ddc.o + common-obj-$(CONFIG_EDID) += edid-generate.o edid-region.o + +-# Disabled for Red Hat Enterprise Linux +-#common-obj-$(CONFIG_FW_CFG_DMA) += ramfb.o +-#common-obj-$(CONFIG_FW_CFG_DMA) += ramfb-standalone.o ++common-obj-$(CONFIG_FW_CFG_DMA) += ramfb.o ++common-obj-$(CONFIG_FW_CFG_DMA) += ramfb-standalone.o + + common-obj-$(CONFIG_ADS7846) += ads7846.o + common-obj-$(CONFIG_VGA_CIRRUS) += cirrus_vga.o +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index d717520..f191904 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3249,7 +3249,6 @@ static const TypeInfo vfio_pci_dev_info = { + }, + }; + +-#if 0 /* Disabled for Red Hat Enterprise Linux */ + static Property vfio_pci_dev_nohotplug_properties[] = { + DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), + DEFINE_PROP_END_OF_LIST(), +@@ -3269,15 +3268,11 @@ static const TypeInfo vfio_pci_nohotplug_dev_info = { + .instance_size = sizeof(VFIOPCIDevice), + .class_init = vfio_pci_nohotplug_dev_class_init, + }; +-#endif + + static void register_vfio_pci_dev_type(void) + { + type_register_static(&vfio_pci_dev_info); +- +-#if 0 /* Disabled for Red Hat Enterprise Linux */ + type_register_static(&vfio_pci_nohotplug_dev_info); +-#endif + } + + type_init(register_vfio_pci_dev_type) +-- +1.8.3.1 + diff --git a/kvm-error-Document-Error-API-usage-rules.patch b/kvm-error-Document-Error-API-usage-rules.patch new file mode 100755 index 0000000..fb9f1b0 --- /dev/null +++ b/kvm-error-Document-Error-API-usage-rules.patch @@ -0,0 +1,154 @@ +From b2ac3e491eb7f18a421e2b1132e527d484681767 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 
16:06:09 -0500 +Subject: [PATCH 08/14] error: Document Error API usage rules +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-5-marcandre.lureau@redhat.com> +Patchwork-id: 100477 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 04/10] error: Document Error API usage rules +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Markus Armbruster + +This merely codifies existing practice, with one exception: the rule +advising against returning void, where existing practice is mixed. + +When the Error API was created, we adopted the (unwritten) rule to +return void when the function returns no useful value on success, +unlike GError, which recommends to return true on success and false on +error then. + +When a function returns a distinct error value, say false, a checked +call that passes the error up looks like + + if (!frobnicate(..., errp)) { + handle the error... + } + +When it returns void, we need + + Error *err = NULL; + + frobnicate(..., &err); + if (err) { + handle the error... + error_propagate(errp, err); + } + +Not only is this more verbose, it also creates an Error object even +when @errp is null, &error_abort or &error_fatal. + +People got tired of the additional boilerplate, and started to ignore +the unwritten rule. The result is confusion among developers about +the preferred usage. + +Make the rule advising against returning void official by putting it +in writing. This will hopefully reduce confusion. + +Update the examples accordingly. + +The remainder of this series will update a substantial amount of code +to honor the rule. 
+ +Signed-off-by: Markus Armbruster +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Greg Kurz +Message-Id: <20200707160613.848843-4-armbru@redhat.com> + +(cherry picked from commit e3fe3988d7851cac30abffae06d2f555ff7bee62) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qapi/error.h | 52 +++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 46 insertions(+), 6 deletions(-) + +diff --git a/include/qapi/error.h b/include/qapi/error.h +index 3351fe76368..08d48e74836 100644 +--- a/include/qapi/error.h ++++ b/include/qapi/error.h +@@ -15,6 +15,33 @@ + /* + * Error reporting system loosely patterned after Glib's GError. + * ++ * = Rules = ++ * ++ * - Functions that use Error to report errors have an Error **errp ++ * parameter. It should be the last parameter, except for functions ++ * taking variable arguments. ++ * ++ * - You may pass NULL to not receive the error, &error_abort to abort ++ * on error, &error_fatal to exit(1) on error, or a pointer to a ++ * variable containing NULL to receive the error. ++ * ++ * - Separation of concerns: the function is responsible for detecting ++ * errors and failing cleanly; handling the error is its caller's ++ * job. Since the value of @errp is about handling the error, the ++ * function should not examine it. ++ * ++ * - On success, the function should not touch *errp. On failure, it ++ * should set a new error, e.g. with error_setg(errp, ...), or ++ * propagate an existing one, e.g. with error_propagate(errp, ...). ++ * ++ * - Whenever practical, also return a value that indicates success / ++ * failure. This can make the error checking more concise, and can ++ * avoid useless error object creation and destruction. Note that ++ * we still have many functions returning void. 
We recommend ++ * • bool-valued functions return true on success / false on failure, ++ * • pointer-valued functions return non-null / null pointer, and ++ * • integer-valued functions return non-negative / negative. ++ * + * = Creating errors = + * + * Create an error: +@@ -95,14 +122,13 @@ + * Create a new error and pass it to the caller: + * error_setg(errp, "situation normal, all fouled up"); + * +- * Call a function and receive an error from it: +- * Error *err = NULL; +- * foo(arg, &err); +- * if (err) { ++ * Call a function, receive an error from it, and pass it to the caller ++ * - when the function returns a value that indicates failure, say ++ * false: ++ * if (!foo(arg, errp)) { + * handle the error... + * } +- * +- * Receive an error and pass it on to the caller: ++ * - when it does not, say because it is a void function: + * Error *err = NULL; + * foo(arg, &err); + * if (err) { +@@ -120,6 +146,20 @@ + * foo(arg, errp); + * for readability. + * ++ * Receive an error, and handle it locally ++ * - when the function returns a value that indicates failure, say ++ * false: ++ * Error *err = NULL; ++ * if (!foo(arg, &err)) { ++ * handle the error... ++ * } ++ * - when it does not, say because it is a void function: ++ * Error *err = NULL; ++ * foo(arg, &err); ++ * if (err) { ++ * handle the error... 
++ * } ++ * + * Receive and accumulate multiple errors (first one wins): + * Error *err = NULL, *local_err = NULL; + * foo(arg, &err); +-- +2.27.0 + diff --git a/kvm-error-Fix-examples-in-error.h-s-big-comment.patch b/kvm-error-Fix-examples-in-error.h-s-big-comment.patch new file mode 100755 index 0000000..ee14eb5 --- /dev/null +++ b/kvm-error-Fix-examples-in-error.h-s-big-comment.patch @@ -0,0 +1,85 @@ +From fe7dd779a9674dc54ffe296247ae6559f2b55b22 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:07 -0500 +Subject: [PATCH 06/14] error: Fix examples in error.h's big comment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-3-marcandre.lureau@redhat.com> +Patchwork-id: 100473 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 02/10] error: Fix examples in error.h's big comment +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Markus Armbruster + +Mark a bad example more clearly. Fix the error_propagate_prepend() +example. Add a missing declaration and a second error pileup example. + +Signed-off-by: Markus Armbruster +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Greg Kurz +Message-Id: <20200707160613.848843-2-armbru@redhat.com> + +(cherry picked from commit 47ff5ac81e8bb3096500de7b132051691d533d36) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qapi/error.h | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/include/qapi/error.h b/include/qapi/error.h +index 3f95141a01a..83c38f9a188 100644 +--- a/include/qapi/error.h ++++ b/include/qapi/error.h +@@ -24,7 +24,7 @@ + * "charm, top, bottom.\n"); + * + * Do *not* contract this to +- * error_setg(&err, "invalid quark\n" ++ * error_setg(&err, "invalid quark\n" // WRONG! 
+ * "Valid quarks are up, down, strange, charm, top, bottom."); + * + * Report an error to the current monitor if we have one, else stderr: +@@ -52,7 +52,8 @@ + * where Error **errp is a parameter, by convention the last one. + * + * Pass an existing error to the caller with the message modified: +- * error_propagate_prepend(errp, err); ++ * error_propagate_prepend(errp, err, ++ * "Could not frobnicate '%s': ", name); + * + * Avoid + * error_propagate(errp, err); +@@ -108,12 +109,23 @@ + * } + * + * Do *not* "optimize" this to ++ * Error *err = NULL; + * foo(arg, &err); + * bar(arg, &err); // WRONG! + * if (err) { + * handle the error... + * } + * because this may pass a non-null err to bar(). ++ * ++ * Likewise, do *not* ++ * Error *err = NULL; ++ * if (cond1) { ++ * error_setg(&err, ...); ++ * } ++ * if (cond2) { ++ * error_setg(&err, ...); // WRONG! ++ * } ++ * because this may pass a non-null err to error_setg(). + */ + + #ifndef ERROR_H +-- +2.27.0 + diff --git a/kvm-error-Improve-error.h-s-big-comment.patch b/kvm-error-Improve-error.h-s-big-comment.patch new file mode 100755 index 0000000..0ad4367 --- /dev/null +++ b/kvm-error-Improve-error.h-s-big-comment.patch @@ -0,0 +1,146 @@ +From 439c11850165fd838e367aa6d4fff4af951a5bd9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:08 -0500 +Subject: [PATCH 07/14] error: Improve error.h's big comment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-4-marcandre.lureau@redhat.com> +Patchwork-id: 100474 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 03/10] error: Improve error.h's big comment +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Markus Armbruster + +Add headlines to the big comment. + +Explain examples for NULL, &error_abort and &error_fatal argument +better. 
+ +Tweak rationale for error_propagate_prepend(). + +Signed-off-by: Markus Armbruster +Message-Id: <20200707160613.848843-3-armbru@redhat.com> +Reviewed-by: Eric Blake +Reviewed-by: Greg Kurz + +(cherry picked from commit 9aac7d486cc792191c25c30851f501624b0c2751) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qapi/error.h | 51 +++++++++++++++++++++++++++++++------------- + 1 file changed, 36 insertions(+), 15 deletions(-) + +diff --git a/include/qapi/error.h b/include/qapi/error.h +index 83c38f9a188..3351fe76368 100644 +--- a/include/qapi/error.h ++++ b/include/qapi/error.h +@@ -15,6 +15,8 @@ + /* + * Error reporting system loosely patterned after Glib's GError. + * ++ * = Creating errors = ++ * + * Create an error: + * error_setg(&err, "situation normal, all fouled up"); + * +@@ -27,6 +29,8 @@ + * error_setg(&err, "invalid quark\n" // WRONG! + * "Valid quarks are up, down, strange, charm, top, bottom."); + * ++ * = Reporting and destroying errors = ++ * + * Report an error to the current monitor if we have one, else stderr: + * error_report_err(err); + * This frees the error object. +@@ -40,6 +44,30 @@ + * error_free(err); + * Note that this loses hints added with error_append_hint(). 
+ * ++ * Call a function ignoring errors: ++ * foo(arg, NULL); ++ * This is more concise than ++ * Error *err = NULL; ++ * foo(arg, &err); ++ * error_free(err); // don't do this ++ * ++ * Call a function aborting on errors: ++ * foo(arg, &error_abort); ++ * This is more concise and fails more nicely than ++ * Error *err = NULL; ++ * foo(arg, &err); ++ * assert(!err); // don't do this ++ * ++ * Call a function treating errors as fatal: ++ * foo(arg, &error_fatal); ++ * This is more concise than ++ * Error *err = NULL; ++ * foo(arg, &err); ++ * if (err) { // don't do this ++ * error_report_err(err); ++ * exit(1); ++ * } ++ * + * Handle an error without reporting it (just for completeness): + * error_free(err); + * +@@ -47,6 +75,11 @@ + * reporting it (primarily useful in testsuites): + * error_free_or_abort(&err); + * ++ * = Passing errors around = ++ * ++ * Errors get passed to the caller through the conventional @errp ++ * parameter. ++ * + * Pass an existing error to the caller: + * error_propagate(errp, err); + * where Error **errp is a parameter, by convention the last one. +@@ -54,11 +87,10 @@ + * Pass an existing error to the caller with the message modified: + * error_propagate_prepend(errp, err, + * "Could not frobnicate '%s': ", name); +- * +- * Avoid +- * error_propagate(errp, err); ++ * This is more concise than ++ * error_propagate(errp, err); // don't do this + * error_prepend(errp, "Could not frobnicate '%s': ", name); +- * because this fails to prepend when @errp is &error_fatal. ++ * and works even when @errp is &error_fatal. + * + * Create a new error and pass it to the caller: + * error_setg(errp, "situation normal, all fouled up"); +@@ -70,15 +102,6 @@ + * handle the error... 
+ * } + * +- * Call a function ignoring errors: +- * foo(arg, NULL); +- * +- * Call a function aborting on errors: +- * foo(arg, &error_abort); +- * +- * Call a function treating errors as fatal: +- * foo(arg, &error_fatal); +- * + * Receive an error and pass it on to the caller: + * Error *err = NULL; + * foo(arg, &err); +@@ -86,8 +109,6 @@ + * handle the error... + * error_propagate(errp, err); + * } +- * where Error **errp is a parameter, by convention the last one. +- * + * Do *not* "optimize" this to + * foo(arg, errp); + * if (*errp) { // WRONG! +-- +2.27.0 + diff --git a/kvm-error-New-macro-ERRP_GUARD.patch b/kvm-error-New-macro-ERRP_GUARD.patch new file mode 100755 index 0000000..d67ad7c --- /dev/null +++ b/kvm-error-New-macro-ERRP_GUARD.patch @@ -0,0 +1,305 @@ +From 46c3298774b976cc6a1cd834751e644fb482b08e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:10 -0500 +Subject: [PATCH 09/14] error: New macro ERRP_GUARD() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-6-marcandre.lureau@redhat.com> +Patchwork-id: 100476 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 05/10] error: New macro ERRP_GUARD() +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Vladimir Sementsov-Ogievskiy + +Introduce a new ERRP_GUARD() macro, to be used at start of functions +with an errp OUT parameter. + +It has three goals: + +1. Fix issue with error_fatal and error_prepend/error_append_hint: the +user can't see this additional information, because exit() happens in +error_setg earlier than information is added. [Reported by Greg Kurz] + +2. Fix issue with error_abort and error_propagate: when we wrap +error_abort by local_err+error_propagate, the resulting coredump will +refer to error_propagate and not to the place where error happened. 
+(the macro itself doesn't fix the issue, but it allows us to [3.] drop +the local_err+error_propagate pattern, which will definitely fix the +issue) [Reported by Kevin Wolf] + +3. Drop local_err+error_propagate pattern, which is used to work around +void functions with errp parameter, when caller wants to know resulting +status. (Note: actually these functions could be merely updated to +return int error code). + +To achieve these goals, later patches will add invocations +of this macro at the start of functions which either use +error_prepend/error_append_hint (solving 1) or which use +local_err+error_propagate to check errors, switching those +functions to use *errp instead (solving 2 and 3). + +Signed-off-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Paul Durrant +Reviewed-by: Greg Kurz +Reviewed-by: Eric Blake +[Merge comments properly with recent commit "error: Document Error API +usage rules", and edit for clarity. Put ERRP_AUTO_PROPAGATE() before +its helpers, and touch up style. Tweak commit message.] +Signed-off-by: Markus Armbruster +Message-Id: <20200707165037.1026246-2-armbru@redhat.com> + +(cherry picked from commit ae7c80a7bd73685437bf6ba9d7c26098351f4166) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qapi/error.h | 158 +++++++++++++++++++++++++++++++++++++------ + 1 file changed, 139 insertions(+), 19 deletions(-) + +diff --git a/include/qapi/error.h b/include/qapi/error.h +index 08d48e74836..e658790acfc 100644 +--- a/include/qapi/error.h ++++ b/include/qapi/error.h +@@ -30,6 +30,10 @@ + * job. Since the value of @errp is about handling the error, the + * function should not examine it. + * ++ * - The function may pass @errp to functions it calls to pass on ++ * their errors to its caller. If it dereferences @errp to check ++ * for errors, it must use ERRP_GUARD(). ++ * + * - On success, the function should not touch *errp. On failure, it + * should set a new error, e.g.
with error_setg(errp, ...), or + * propagate an existing one, e.g. with error_propagate(errp, ...). +@@ -45,15 +49,17 @@ + * = Creating errors = + * + * Create an error: +- * error_setg(&err, "situation normal, all fouled up"); ++ * error_setg(errp, "situation normal, all fouled up"); ++ * where @errp points to the location to receive the error. + * + * Create an error and add additional explanation: +- * error_setg(&err, "invalid quark"); +- * error_append_hint(&err, "Valid quarks are up, down, strange, " ++ * error_setg(errp, "invalid quark"); ++ * error_append_hint(errp, "Valid quarks are up, down, strange, " + * "charm, top, bottom.\n"); ++ * This may require use of ERRP_GUARD(); more on that below. + * + * Do *not* contract this to +- * error_setg(&err, "invalid quark\n" // WRONG! ++ * error_setg(errp, "invalid quark\n" // WRONG! + * "Valid quarks are up, down, strange, charm, top, bottom."); + * + * = Reporting and destroying errors = +@@ -107,18 +113,6 @@ + * Errors get passed to the caller through the conventional @errp + * parameter. + * +- * Pass an existing error to the caller: +- * error_propagate(errp, err); +- * where Error **errp is a parameter, by convention the last one. +- * +- * Pass an existing error to the caller with the message modified: +- * error_propagate_prepend(errp, err, +- * "Could not frobnicate '%s': ", name); +- * This is more concise than +- * error_propagate(errp, err); // don't do this +- * error_prepend(errp, "Could not frobnicate '%s': ", name); +- * and works even when @errp is &error_fatal. +- * + * Create a new error and pass it to the caller: + * error_setg(errp, "situation normal, all fouled up"); + * +@@ -129,18 +123,26 @@ + * handle the error... + * } + * - when it does not, say because it is a void function: ++ * ERRP_GUARD(); ++ * foo(arg, errp); ++ * if (*errp) { ++ * handle the error... ++ * } ++ * More on ERRP_GUARD() below. 
++ * ++ * Code predating ERRP_GUARD() still exists, and looks like this: + * Error *err = NULL; + * foo(arg, &err); + * if (err) { + * handle the error... +- * error_propagate(errp, err); ++ * error_propagate(errp, err); // deprecated + * } +- * Do *not* "optimize" this to ++ * Avoid in new code. Do *not* "optimize" it to + * foo(arg, errp); + * if (*errp) { // WRONG! + * handle the error... + * } +- * because errp may be NULL! ++ * because errp may be NULL without the ERRP_GUARD() guard. + * + * But when all you do with the error is pass it on, please use + * foo(arg, errp); +@@ -160,6 +162,19 @@ + * handle the error... + * } + * ++ * Pass an existing error to the caller: ++ * error_propagate(errp, err); ++ * This is rarely needed. When @err is a local variable, use of ++ * ERRP_GUARD() commonly results in more readable code. ++ * ++ * Pass an existing error to the caller with the message modified: ++ * error_propagate_prepend(errp, err, ++ * "Could not frobnicate '%s': ", name); ++ * This is more concise than ++ * error_propagate(errp, err); // don't do this ++ * error_prepend(errp, "Could not frobnicate '%s': ", name); ++ * and works even when @errp is &error_fatal. ++ * + * Receive and accumulate multiple errors (first one wins): + * Error *err = NULL, *local_err = NULL; + * foo(arg, &err); +@@ -187,6 +202,69 @@ + * error_setg(&err, ...); // WRONG! + * } + * because this may pass a non-null err to error_setg(). ++ * ++ * = Why, when and how to use ERRP_GUARD() = ++ * ++ * Without ERRP_GUARD(), use of the @errp parameter is restricted: ++ * - It must not be dereferenced, because it may be null. ++ * - It should not be passed to error_prepend() or ++ * error_append_hint(), because that doesn't work with &error_fatal. ++ * ERRP_GUARD() lifts these restrictions. ++ * ++ * To use ERRP_GUARD(), add it right at the beginning of the function. ++ * @errp can then be used without worrying about the argument being ++ * NULL or &error_fatal. 
++ * ++ * Using it when it's not needed is safe, but please avoid cluttering ++ * the source with useless code. ++ * ++ * = Converting to ERRP_GUARD() = ++ * ++ * To convert a function to use ERRP_GUARD(): ++ * ++ * 0. If the Error ** parameter is not named @errp, rename it to ++ * @errp. ++ * ++ * 1. Add an ERRP_GUARD() invocation, by convention right at the ++ * beginning of the function. This makes @errp safe to use. ++ * ++ * 2. Replace &err by errp, and err by *errp. Delete local variable ++ * @err. ++ * ++ * 3. Delete error_propagate(errp, *errp), replace ++ * error_propagate_prepend(errp, *errp, ...) by error_prepend(errp, ...) ++ * ++ * 4. Ensure @errp is valid at return: when you destroy *errp, set ++ * errp = NULL. ++ * ++ * Example: ++ * ++ * bool fn(..., Error **errp) ++ * { ++ * Error *err = NULL; ++ * ++ * foo(arg, &err); ++ * if (err) { ++ * handle the error... ++ * error_propagate(errp, err); ++ * return false; ++ * } ++ * ... ++ * } ++ * ++ * becomes ++ * ++ * bool fn(..., Error **errp) ++ * { ++ * ERRP_GUARD(); ++ * ++ * foo(arg, errp); ++ * if (*errp) { ++ * handle the error... ++ * return false; ++ * } ++ * ... ++ * } + */ + + #ifndef ERROR_H +@@ -287,6 +365,7 @@ void error_setg_win32_internal(Error **errp, + * the error object. + * Else, move the error object from @local_err to *@dst_errp. + * On return, @local_err is invalid. ++ * Please use ERRP_GUARD() instead when possible. + * Please don't error_propagate(&error_fatal, ...), use + * error_report_err() and exit(), because that's more obvious. + */ +@@ -298,6 +377,7 @@ void error_propagate(Error **dst_errp, Error *local_err); + * Behaves like + * error_prepend(&local_err, fmt, ...); + * error_propagate(dst_errp, local_err); ++ * Please use ERRP_GUARD() and error_prepend() instead when possible. 
+ */ + void error_propagate_prepend(Error **dst_errp, Error *local_err, + const char *fmt, ...); +@@ -395,6 +475,46 @@ void error_set_internal(Error **errp, + ErrorClass err_class, const char *fmt, ...) + GCC_FMT_ATTR(6, 7); + ++/* ++ * Make @errp parameter easier to use regardless of argument value ++ * ++ * This macro is for use right at the beginning of a function that ++ * takes an Error **errp parameter to pass errors to its caller. The ++ * parameter must be named @errp. ++ * ++ * It must be used when the function dereferences @errp or passes ++ * @errp to error_prepend(), error_vprepend(), or error_append_hint(). ++ * It is safe to use even when it's not needed, but please avoid ++ * cluttering the source with useless code. ++ * ++ * If @errp is NULL or &error_fatal, rewrite it to point to a local ++ * Error variable, which will be automatically propagated to the ++ * original @errp on function exit. ++ * ++ * Note: &error_abort is not rewritten, because that would move the ++ * abort from the place where the error is created to the place where ++ * it's propagated. ++ */ ++#define ERRP_GUARD() \ ++ g_auto(ErrorPropagator) _auto_errp_prop = {.errp = errp}; \ ++ do { \ ++ if (!errp || errp == &error_fatal) { \ ++ errp = &_auto_errp_prop.local_err; \ ++ } \ ++ } while (0) ++ ++typedef struct ErrorPropagator { ++ Error *local_err; ++ Error **errp; ++} ErrorPropagator; ++ ++static inline void error_propagator_cleanup(ErrorPropagator *prop) ++{ ++ error_propagate(prop->errp, prop->local_err); ++} ++ ++G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(ErrorPropagator, error_propagator_cleanup); ++ + /* + * Special error destination to abort on error. + * See error_setg() and error_propagate() for details. 
+-- +2.27.0 + diff --git a/kvm-exec-rom_reset-Free-rom-data-during-inmigrate-skip.patch b/kvm-exec-rom_reset-Free-rom-data-during-inmigrate-skip.patch new file mode 100755 index 0000000..5d44708 --- /dev/null +++ b/kvm-exec-rom_reset-Free-rom-data-during-inmigrate-skip.patch @@ -0,0 +1,85 @@ +From 5770fe43fe1e15e6f53cfd3705605e8645b95a98 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Fri, 13 Mar 2020 17:17:08 +0000 +Subject: [PATCH 20/20] exec/rom_reset: Free rom data during inmigrate skip +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200313171708.242774-1-dgilbert@redhat.com> +Patchwork-id: 94292 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] exec/rom_reset: Free rom data during inmigrate skip +Bugzilla: 1809380 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Paolo Bonzini + +From: "Dr. David Alan Gilbert" + +bz: https://bugzilla.redhat.com/show_bug.cgi?id=1809380 +brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=27249921 +branch: rhel-av-8.2.0 +upstream: Posted and with review-by, not merged yet + +Commit 355477f8c73e9 skips rom reset when we're an incoming migration +so as not to overwrite shared ram in the ignore-shared migration +optimisation. +However, it's got an unexpected side effect that because it skips +freeing the ROM data, when rom_reset gets called later on, after +migration (e.g. during a reboot), the ROM does get reset to the original +file contents. Because of seabios/x86's weird reboot process +this confuses a reboot into hanging after a migration. + +Fixes: 355477f8c73e9 ("migration: do not rom_reset() during incoming migration") +https://bugzilla.redhat.com/show_bug.cgi?id=1809380 + +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/core/loader.c | 25 ++++++++++++++++--------- + 1 file changed, 16 insertions(+), 9 deletions(-) + +diff --git a/hw/core/loader.c b/hw/core/loader.c +index 5099f27..375b29b 100644 +--- a/hw/core/loader.c ++++ b/hw/core/loader.c +@@ -1118,19 +1118,26 @@ static void rom_reset(void *unused) + { + Rom *rom; + +- /* +- * We don't need to fill in the RAM with ROM data because we'll fill +- * the data in during the next incoming migration in all cases. Note +- * that some of those RAMs can actually be modified by the guest on ARM +- * so this is probably the only right thing to do here. +- */ +- if (runstate_check(RUN_STATE_INMIGRATE)) +- return; +- + QTAILQ_FOREACH(rom, &roms, next) { + if (rom->fw_file) { + continue; + } ++ /* ++ * We don't need to fill in the RAM with ROM data because we'll fill ++ * the data in during the next incoming migration in all cases. Note ++ * that some of those RAMs can actually be modified by the guest. ++ */ ++ if (runstate_check(RUN_STATE_INMIGRATE)) { ++ if (rom->data && rom->isrom) { ++ /* ++ * Free it so that a rom_reset after migration doesn't ++ * overwrite a potentially modified 'rom'. 
++ */ ++ rom_free_data(rom); ++ } ++ continue; ++ } ++ + if (rom->data == NULL) { + continue; + } +-- +1.8.3.1 + diff --git a/kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch b/kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch new file mode 100755 index 0000000..aa47108 --- /dev/null +++ b/kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch @@ -0,0 +1,96 @@ +From 4e553943c8fe4924d194884b4719c5459210c686 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Tue, 26 Jan 2021 17:21:03 -0500 +Subject: [PATCH 8/9] file-posix: Allow byte-aligned O_DIRECT with NFS + +RH-Author: Kevin Wolf +Message-id: <20210126172103.136060-3-kwolf@redhat.com> +Patchwork-id: 100785 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/2] file-posix: Allow byte-aligned O_DIRECT with NFS +Bugzilla: 1834281 +RH-Acked-by: Markus Armbruster +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +Since commit a6b257a08e3 ('file-posix: Handle undetectable alignment'), +we assume that if we open a file with O_DIRECT and alignment probing +returns 1, we just couldn't find out the real alignment requirement +because some filesystems make the requirement only for allocated blocks. +In this case, a safe default of 4k is used. + +This is too strict for NFS, which does actually allow byte-aligned +requests even with O_DIRECT. Because we can't distinguish both cases +with generic code, let's just look at the file system magic and disable +s->needs_alignment for NFS. This way, O_DIRECT can still be used on NFS +for images that are not aligned to 4k. 
+ +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Message-Id: <20200716142601.111237-3-kwolf@redhat.com> +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 5edc85571e7b7269dce408735eba7507f18ac666) +Signed-off-by: Kevin Wolf +Signed-off-by: Jon Maloy +--- + block/file-posix.c | 26 +++++++++++++++++++++++++- + 1 file changed, 25 insertions(+), 1 deletion(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index adafbfa1be..2d834fbdf6 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -61,10 +61,12 @@ + #include + #include + #include ++#include + #include + #include + #include + #include ++#include + #include + #ifdef __s390__ + #include +@@ -298,6 +300,28 @@ static int probe_physical_blocksize(int fd, unsigned int *blk_size) + #endif + } + ++/* ++ * Returns true if no alignment restrictions are necessary even for files ++ * opened with O_DIRECT. ++ * ++ * raw_probe_alignment() probes the required alignment and assume that 1 means ++ * the probing failed, so it falls back to a safe default of 4k. This can be ++ * avoided if we know that byte alignment is okay for the file. ++ */ ++static bool dio_byte_aligned(int fd) ++{ ++#ifdef __linux__ ++ struct statfs buf; ++ int ret; ++ ++ ret = fstatfs(fd, &buf); ++ if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) { ++ return true; ++ } ++#endif ++ return false; ++} ++ + /* Check if read is allowed with given memory buffer and length. + * + * This function is used to check O_DIRECT memory buffer and request alignment. 
+@@ -602,7 +626,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, + + s->has_discard = true; + s->has_write_zeroes = true; +- if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { ++ if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) { + s->needs_alignment = true; + } + +-- +2.18.2 + diff --git a/kvm-file-posix-Drop-hdev_co_create_opts.patch b/kvm-file-posix-Drop-hdev_co_create_opts.patch new file mode 100755 index 0000000..ea2edbd --- /dev/null +++ b/kvm-file-posix-Drop-hdev_co_create_opts.patch @@ -0,0 +1,131 @@ +From 3d3509c010129bd15eb1f5ec1a7b9eedcdbf23f6 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:44 +0000 +Subject: [PATCH 03/20] file-posix: Drop hdev_co_create_opts() + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-4-mlevitsk@redhat.com> +Patchwork-id: 94225 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 3/6] file-posix: Drop hdev_co_create_opts() +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +The generic fallback implementation effectively does the same. + +Reviewed-by: Maxim Levitsky +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-4-mreitz@redhat.com> +Signed-off-by: Max Reitz +(cherry picked from commit 87ca3b8fa615b278b33cabf9ed22b3f44b5214ba) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. 
de Paula +--- + block/file-posix.c | 67 ------------------------------------------------------ + 1 file changed, 67 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 1b805bd..fd29372 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -3418,67 +3418,6 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, + return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true); + } + +-static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts, +- Error **errp) +-{ +- int fd; +- int ret = 0; +- struct stat stat_buf; +- int64_t total_size = 0; +- bool has_prefix; +- +- /* This function is used by both protocol block drivers and therefore either +- * of these prefixes may be given. +- * The return value has to be stored somewhere, otherwise this is an error +- * due to -Werror=unused-value. */ +- has_prefix = +- strstart(filename, "host_device:", &filename) || +- strstart(filename, "host_cdrom:" , &filename); +- +- (void)has_prefix; +- +- ret = raw_normalize_devicepath(&filename, errp); +- if (ret < 0) { +- return ret; +- } +- +- /* Read out options */ +- total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +- BDRV_SECTOR_SIZE); +- +- fd = qemu_open(filename, O_WRONLY | O_BINARY); +- if (fd < 0) { +- ret = -errno; +- error_setg_errno(errp, -ret, "Could not open device"); +- return ret; +- } +- +- if (fstat(fd, &stat_buf) < 0) { +- ret = -errno; +- error_setg_errno(errp, -ret, "Could not stat device"); +- } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { +- error_setg(errp, +- "The given file is neither a block nor a character device"); +- ret = -ENODEV; +- } else if (lseek(fd, 0, SEEK_END) < total_size) { +- error_setg(errp, "Device is too small"); +- ret = -ENOSPC; +- } +- +- if (!ret && total_size) { +- uint8_t buf[BDRV_SECTOR_SIZE] = { 0 }; +- int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size); +- if (lseek(fd, 0, SEEK_SET) == -1) { +- ret = -errno; +- } else { 
+- ret = qemu_write_full(fd, buf, zero_size); +- ret = ret == zero_size ? 0 : -errno; +- } +- } +- qemu_close(fd); +- return ret; +-} +- + static BlockDriver bdrv_host_device = { + .format_name = "host_device", + .protocol_name = "host_device", +@@ -3491,8 +3430,6 @@ static BlockDriver bdrv_host_device = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, +- .bdrv_co_create_opts = hdev_co_create_opts, +- .create_opts = &raw_create_opts, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, +@@ -3619,8 +3556,6 @@ static BlockDriver bdrv_host_cdrom = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, +- .bdrv_co_create_opts = hdev_co_create_opts, +- .create_opts = &raw_create_opts, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + +@@ -3753,8 +3688,6 @@ static BlockDriver bdrv_host_cdrom = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, +- .bdrv_co_create_opts = hdev_co_create_opts, +- .create_opts = &raw_create_opts, + .mutable_opts = mutable_opts, + + .bdrv_co_preadv = raw_co_preadv, +-- +1.8.3.1 + diff --git a/kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch b/kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch new file mode 100755 index 0000000..ac7b859 --- /dev/null +++ b/kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch @@ -0,0 +1,59 @@ +From 94d99b13b48e922861570f043490efc966b3b445 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 25 Jun 2021 17:41:04 -0400 +Subject: [PATCH 4/4] file-posix: Handle `EINVAL` fallocate return value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Kevin Wolf +Message-id: 
<20210625174104.44313-3-kwolf@redhat.com> +Patchwork-id: 101778 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] file-posix: Handle `EINVAL` fallocate return value +Bugzilla: 1970912 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Antoine Damhet + +The `detect-zeroes=unmap` option may issue unaligned +`FALLOC_FL_PUNCH_HOLE` requests, raw block devices can (and will) return +`EINVAL`, qemu should then write the zeroes to the blockdev instead of +issuing an `IO_ERROR`. + +The problem can be reproduced like this: + +$ qemu-io -c 'write -P 0 42 1234' --image-opts driver=host_device,filename=/dev/loop0,detect-zeroes=unmap +write failed: Invalid argument + +Signed-off-by: Antoine Damhet +Message-Id: <20200717135603.51180-1-antoine.damhet@blade-group.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit bae127d4dcf6158c5042e2eee9582430839a9967) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 837edcf027..6cd19e6c9a 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -1632,7 +1632,11 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque) + #ifdef CONFIG_FALLOCATE_PUNCH_HOLE + int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes); +- if (ret != -ENOTSUP) { ++ switch (ret) { ++ case -ENOTSUP: ++ case -EINVAL: ++ break; ++ default: + return ret; + } + #endif +-- +2.27.0 + diff --git a/kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch b/kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch new file mode 100755 index 0000000..e8639f3 --- /dev/null +++ b/kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch @@ -0,0 +1,466 @@ +From 7ee01b5ccb7fc660dafaf3fdb1578649d17fbddf Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 26 May 2021 09:05:52 -0400
+Subject: [PATCH 1/4] file-posix: Mitigate file fragmentation with extent size + hints + +RH-Author: Kevin Wolf +Message-id: <20210526090552.155820-2-kwolf@redhat.com> +Patchwork-id: 101638 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] file-posix: Mitigate file fragmentation with extent size hints +Bugzilla: 1877163 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +Especially when O_DIRECT is used with image files so that the page cache +indirection can't cause a merge of allocating requests, the file will +fragment on the file system layer, with a potentially very small +fragment size (this depends on the requests the guest sent). + +On Linux, fragmentation can be reduced by setting an extent size hint +when creating the file (at least on XFS, it can't be set any more after +the first extent has been allocated), basically giving raw files a +"cluster size" for allocation. + +This adds a create option to set the extent size hint, and changes the +default from not setting a hint to setting it to 1 MB. The main reason +why qcow2 defaults to smaller cluster sizes is that COW becomes more +expensive, which is not an issue with raw files, so we can choose a +larger size. The tradeoff here is only potentially wasted disk space. + +For qcow2 (or other image formats) over file-posix, the advantage should +even be greater because they grow sequentially without leaving holes, so +there won't be wasted space. Setting even larger extent size hints for +such images may make sense. This can be done with the new option, but +let's keep the default conservative for now. + +The effect is very visible with a test that intentionally creates a +badly fragmented file with qemu-img bench (the time difference while +creating the file is already remarkable) and then looks at the number of +extents and the time a simple "qemu-img map" takes. 
+ +Without an extent size hint: + + $ ./qemu-img create -f raw -o extent_size_hint=0 ~/tmp/test.raw 10G + Formatting '/home/kwolf/tmp/test.raw', fmt=raw size=10737418240 extent_size_hint=0 + $ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 0 + Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 0, step size 8192) + Run completed in 25.848 seconds. + $ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 4096 + Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 4096, step size 8192) + Run completed in 19.616 seconds. + $ filefrag ~/tmp/test.raw + /home/kwolf/tmp/test.raw: 2000000 extents found + $ time ./qemu-img map ~/tmp/test.raw + Offset Length Mapped to File + 0 0x1e8480000 0 /home/kwolf/tmp/test.raw + + real 0m1,279s + user 0m0,043s + sys 0m1,226s + +With the new default extent size hint of 1 MB: + + $ ./qemu-img create -f raw -o extent_size_hint=1M ~/tmp/test.raw 10G + Formatting '/home/kwolf/tmp/test.raw', fmt=raw size=10737418240 extent_size_hint=1048576 + $ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 0 + Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 0, step size 8192) + Run completed in 11.833 seconds. + $ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 4096 + Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 4096, step size 8192) + Run completed in 10.155 seconds. 
+ $ filefrag ~/tmp/test.raw + /home/kwolf/tmp/test.raw: 178 extents found + $ time ./qemu-img map ~/tmp/test.raw + Offset Length Mapped to File + 0 0x1e8480000 0 /home/kwolf/tmp/test.raw + + real 0m0,061s + user 0m0,040s + sys 0m0,014s + +Signed-off-by: Kevin Wolf +Message-Id: <20200707142329.48303-1-kwolf@redhat.com> +Reviewed-by: Eric Blake +Signed-off-by: Kevin Wolf +(cherry picked from commit ffa244c84a1a30dff69ecc80b0137a2b6d428ecb) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 44 ++++++++++++++++++++++++++++++++ + include/block/block_int.h | 1 + + qapi/block-core.json | 11 +++++--- + tests/qemu-iotests/082.out | 16 ++++++++++++ + tests/qemu-iotests/106 | 7 +++-- + tests/qemu-iotests/175 | 6 ++--- + tests/qemu-iotests/243 | 6 ++--- + tests/qemu-iotests/common.filter | 1 + + 8 files changed, 80 insertions(+), 12 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 2d834fbdf6..62a463229f 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -30,6 +30,7 @@ + #include "block/block_int.h" + #include "qemu/module.h" + #include "qemu/option.h" ++#include "qemu/units.h" + #include "trace.h" + #include "block/thread-pool.h" + #include "qemu/iov.h" +@@ -2289,6 +2290,14 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) + if (!file_opts->has_preallocation) { + file_opts->preallocation = PREALLOC_MODE_OFF; + } ++ if (!file_opts->has_extent_size_hint) { ++ file_opts->extent_size_hint = 1 * MiB; ++ } ++ if (file_opts->extent_size_hint > UINT32_MAX) { ++ result = -EINVAL; ++ error_setg(errp, "Extent size hint is too large"); ++ goto out; ++ } + + /* Create file */ + fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644); +@@ -2346,6 +2355,27 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) + } + #endif + } ++#ifdef FS_IOC_FSSETXATTR ++ /* ++ * Try to set the extent size hint. 
Failure is not fatal, and a warning is ++ * only printed if the option was explicitly specified. ++ */ ++ { ++ struct fsxattr attr; ++ result = ioctl(fd, FS_IOC_FSGETXATTR, &attr); ++ if (result == 0) { ++ attr.fsx_xflags |= FS_XFLAG_EXTSIZE; ++ attr.fsx_extsize = file_opts->extent_size_hint; ++ result = ioctl(fd, FS_IOC_FSSETXATTR, &attr); ++ } ++ if (result < 0 && file_opts->has_extent_size_hint && ++ file_opts->extent_size_hint) ++ { ++ warn_report("Failed to set extent size hint: %s", ++ strerror(errno)); ++ } ++ } ++#endif + + /* Resize and potentially preallocate the file to the desired + * final size */ +@@ -2381,6 +2411,8 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + { + BlockdevCreateOptions options; + int64_t total_size = 0; ++ int64_t extent_size_hint = 0; ++ bool has_extent_size_hint = false; + bool nocow = false; + PreallocMode prealloc; + char *buf = NULL; +@@ -2392,6 +2424,11 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + /* Read out options */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); ++ if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) { ++ has_extent_size_hint = true; ++ extent_size_hint = ++ qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1); ++ } + nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); + buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + prealloc = qapi_enum_parse(&PreallocMode_lookup, buf, +@@ -2411,6 +2448,8 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + .preallocation = prealloc, + .has_nocow = true, + .nocow = nocow, ++ .has_extent_size_hint = has_extent_size_hint, ++ .extent_size_hint = extent_size_hint, + }, + }; + return raw_co_create(&options, errp); +@@ -2902,6 +2941,11 @@ static QemuOptsList raw_create_opts = { + #endif + ", full)" + }, ++ { ++ .name = BLOCK_OPT_EXTENT_SIZE_HINT, ++ .type = QEMU_OPT_SIZE, ++ .help = "Extent size hint for the image file, 0 to disable" ++ }, + { /* end of list */ } + 
} + }; +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 41f13ecbed..4b23da2eb0 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -53,6 +53,7 @@ + #define BLOCK_OPT_ADAPTER_TYPE "adapter_type" + #define BLOCK_OPT_REDUNDANCY "redundancy" + #define BLOCK_OPT_NOCOW "nocow" ++#define BLOCK_OPT_EXTENT_SIZE_HINT "extent_size_hint" + #define BLOCK_OPT_OBJECT_SIZE "object_size" + #define BLOCK_OPT_REFCOUNT_BITS "refcount_bits" + #define BLOCK_OPT_DATA_FILE "data_file" +diff --git a/qapi/block-core.json b/qapi/block-core.json +index 289320902d..c7aa919fa3 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -4272,14 +4272,17 @@ + # falloc (if defined CONFIG_POSIX_FALLOCATE), + # full (if defined CONFIG_POSIX)) + # @nocow Turn off copy-on-write (valid only on btrfs; default: off) ++# @extent-size-hint: Extent size hint to add to the image file; 0 for not ++# adding an extent size hint (default: 1 MB, since 5.1) + # + # Since: 2.12 + ## + { 'struct': 'BlockdevCreateOptionsFile', +- 'data': { 'filename': 'str', +- 'size': 'size', +- '*preallocation': 'PreallocMode', +- '*nocow': 'bool' } } ++ 'data': { 'filename': 'str', ++ 'size': 'size', ++ '*preallocation': 'PreallocMode', ++ '*nocow': 'bool', ++ '*extent-size-hint': 'size'} } + + ## + # @BlockdevCreateOptionsGluster: +diff --git a/tests/qemu-iotests/082.out b/tests/qemu-iotests/082.out +index 9d4ed4dc9d..7a87946fa2 100644 +--- a/tests/qemu-iotests/082.out ++++ b/tests/qemu-iotests/082.out +@@ -59,6 +59,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -82,6 +83,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -105,6 +107,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -128,6 +131,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -151,6 +155,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -174,6 +179,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -197,6 +203,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -220,6 +227,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -339,6 +347,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -362,6 +371,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -385,6 +395,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -408,6 +419,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -431,6 +443,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -454,6 +467,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -477,6 +491,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -500,6 +515,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. 
(Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +diff --git a/tests/qemu-iotests/106 b/tests/qemu-iotests/106 +index ac47eaa0f5..ee6f51d08b 100755 +--- a/tests/qemu-iotests/106 ++++ b/tests/qemu-iotests/106 +@@ -51,7 +51,10 @@ for create_mode in off falloc full; do + echo + echo "--- create_mode=$create_mode growth_mode=$growth_mode ---" + +- IMGOPTS="preallocation=$create_mode" _make_test_img ${CREATION_SIZE}K ++ # Our calculation below assumes kilobytes as unit for the actual size. ++ # Disable the extent size hint because it would give us a result in ++ # megabytes. ++ IMGOPTS="preallocation=$create_mode,extent_size_hint=0" _make_test_img ${CREATION_SIZE}K + $QEMU_IMG resize -f "$IMGFMT" --preallocation=$growth_mode "$TEST_IMG" +${GROWTH_SIZE}K + + expected_size=0 +@@ -98,7 +101,7 @@ for growth_mode in falloc full; do + # plain int. We should use the correct type for the result, and + # this tests we do. 
+ +- _make_test_img 2G ++ _make_test_img -o "extent_size_hint=0" 2G + $QEMU_IMG resize -f "$IMGFMT" --preallocation=$growth_mode "$TEST_IMG" +${GROWTH_SIZE}K + + actual_size=$($QEMU_IMG info -f "$IMGFMT" "$TEST_IMG" | grep 'disk size') +diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175 +index 55db2803ed..8a8494aeb6 100755 +--- a/tests/qemu-iotests/175 ++++ b/tests/qemu-iotests/175 +@@ -89,20 +89,20 @@ min_blocks=$(stat -c '%b' "$TEST_DIR/empty") + + echo + echo "== creating image with default preallocation ==" +-_make_test_img $size | _filter_imgfmt ++_make_test_img -o extent_size_hint=0 $size | _filter_imgfmt + stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size + + for mode in off full falloc; do + echo + echo "== creating image with preallocation $mode ==" +- IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt ++ IMGOPTS="preallocation=$mode,extent_size_hint=0" _make_test_img $size | _filter_imgfmt + stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size + done + + for new_size in 4096 1048576; do + echo + echo "== resize empty image with block_resize ==" +- _make_test_img 0 | _filter_imgfmt ++ _make_test_img -o extent_size_hint=0 0 | _filter_imgfmt + _block_resize $TEST_IMG $new_size >/dev/null + stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $new_size + done +diff --git a/tests/qemu-iotests/243 b/tests/qemu-iotests/243 +index e563761307..104c7256c4 100755 +--- a/tests/qemu-iotests/243 ++++ b/tests/qemu-iotests/243 +@@ -47,7 +47,7 @@ for mode in off metadata falloc full; do + echo "=== preallocation=$mode ===" + echo + +- IMGOPTS="preallocation=$mode" _make_test_img 64M ++ IMGOPTS="preallocation=$mode,extent_size_hint=0" _make_test_img 64M + + printf "File size: " + du -b $TEST_IMG | cut -f1 +@@ -64,7 +64,7 @@ for mode in off metadata falloc full; do + echo "=== External data file: preallocation=$mode ===" + echo + +- 
IMGOPTS="data_file=$TEST_IMG.data,preallocation=$mode" _make_test_img 64M ++ IMGOPTS="data_file=$TEST_IMG.data,preallocation=$mode,extent_size_hint=0" _make_test_img 64M + + echo -n "qcow2 file size: " + du -b $TEST_IMG | cut -f1 +@@ -75,7 +75,7 @@ for mode in off metadata falloc full; do + echo -n "qcow2 disk usage: " + [ $(du -B1 $TEST_IMG | cut -f1) -lt 1048576 ] && echo "low" || echo "high" + echo -n "data disk usage: " +- [ $(du -B1 $TEST_IMG.data | cut -f1) -lt 1048576 ] && echo "low" || echo "high" ++ [ $(du -B1 $TEST_IMG.data | cut -f1) -lt 2097152 ] && echo "low" || echo "high" + + done + +diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter +index c8e8663665..f29c1d3238 100644 +--- a/tests/qemu-iotests/common.filter ++++ b/tests/qemu-iotests/common.filter +@@ -146,6 +146,7 @@ _filter_img_create() + -e "s# refcount_bits=[0-9]\\+##g" \ + -e "s# key-secret=[a-zA-Z0-9]\\+##g" \ + -e "s# iter-time=[0-9]\\+##g" \ ++ -e "s# extent_size_hint=[0-9]\\+##g" \ + -e "s# force_size=\\(on\\|off\\)##g" + } + +-- +2.27.0 + diff --git a/kvm-file-posix-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch b/kvm-file-posix-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch new file mode 100755 index 0000000..efdf16b --- /dev/null +++ b/kvm-file-posix-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch @@ -0,0 +1,48 @@ +From 55bfda3a0e077b822f57e8ed901f0cee848bc471 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:35 +0100 +Subject: [PATCH 07/17] file-posix: Support BDRV_REQ_ZERO_WRITE for truncate + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-7-kwolf@redhat.com> +Patchwork-id: 97452 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 06/11] file-posix: Support BDRV_REQ_ZERO_WRITE for truncate +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +For regular files, we always get BDRV_REQ_ZERO_WRITE behaviour from the +OS, so we can advertise the flag and just ignore it. 
+ +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Max Reitz +Message-Id: <20200424125448.63318-7-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 2f0c6e7a650de133eccd94e9bb6cf7b2070f07f1) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 7551e8d..adafbfa 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -674,6 +674,10 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, + #endif + + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; ++ if (S_ISREG(st.st_mode)) { ++ /* When extending regular files, we get zeros from the OS */ ++ bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; ++ } + ret = 0; + fail: + if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { +-- +1.8.3.1 + diff --git a/kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch b/kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch new file mode 100755 index 0000000..551b2eb --- /dev/null +++ b/kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch @@ -0,0 +1,89 @@ +From 15331267d11713906361ddd767c3e04ae46d9a83 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:50 -0400 +Subject: [PATCH 01/14] glib-compat: add g_unix_get_passwd_entry_qemu() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210609100615.2501448-2-marcandre.lureau@redhat.com> +Patchwork-id: 101687 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/4] glib-compat: add g_unix_get_passwd_entry_qemu() +Bugzilla: 1967716 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michal Privoznik + +From: Marc-André Lureau + +The glib function was introduced in 2.64. 
It's a safer version of +getpwnam, and also simpler to use than getpwnam_r. + +Currently, it's only used by the next patch in qemu-ga, which doesn't +(well well...) need the thread safety guarantees. Since the fallback +version is still unsafe, I would rather keep the _qemu postfix, to make +sure it's not being misused by mistake. When/if necessary, we can +implement a safer fallback and drop the _qemu suffix. + +Signed-off-by: Marc-André Lureau +Reviewed-by: Michal Privoznik +*fix checkpatch warnings about newlines before/after block comments +Signed-off-by: Michael Roth + +(cherry picked from commit 6d593ab451c490b0ca941c6a519894231634751e) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + include/glib-compat.h | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +diff --git a/include/glib-compat.h b/include/glib-compat.h +index 0b0ec76299..695a96f7ea 100644 +--- a/include/glib-compat.h ++++ b/include/glib-compat.h +@@ -30,6 +30,11 @@ + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" + + #include <glib.h> ++#if defined(G_OS_UNIX) ++#include <glib-unix.h> ++#include <sys/types.h> ++#include <pwd.h> ++#endif + + /* + * Note that because of the GLIB_VERSION_MAX_ALLOWED constant above, allowing +@@ -72,6 +77,29 @@ + gint g_poll_fixed(GPollFD *fds, guint nfds, gint timeout); + #endif + ++#if defined(G_OS_UNIX) ++/* ++ * Note: The fallback implementation is not MT-safe, and it returns a copy of ++ * the libc passwd (must be g_free() after use) but not the content. Because of ++ * these important differences the caller must be aware of, it's not #define for ++ * GLib API substitution.
++ */ ++static inline struct passwd * ++g_unix_get_passwd_entry_qemu(const gchar *user_name, GError **error) ++{ ++#if GLIB_CHECK_VERSION(2, 64, 0) ++ return g_unix_get_passwd_entry(user_name, error); ++#else ++ struct passwd *p = getpwnam(user_name); ++ if (!p) { ++ g_set_error_literal(error, G_UNIX_ERROR, 0, g_strerror(errno)); ++ return NULL; ++ } ++ return (struct passwd *)g_memdup(p, sizeof(*p)); ++#endif ++} ++#endif /* G_OS_UNIX */ ++ + #pragma GCC diagnostic pop + + #endif +-- +2.27.0 + diff --git a/kvm-hmat-acpi-Build-Memory-Proximity-Domain-Attributes-S.patch b/kvm-hmat-acpi-Build-Memory-Proximity-Domain-Attributes-S.patch new file mode 100755 index 0000000..e34f576 --- /dev/null +++ b/kvm-hmat-acpi-Build-Memory-Proximity-Domain-Attributes-S.patch @@ -0,0 +1,275 @@ +From a0816e4374759048cb24b9b3549a093a2ccb6240 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:50 +0100 +Subject: [PATCH 07/12] hmat acpi: Build Memory Proximity Domain Attributes + Structure(s) + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-7-plai@redhat.com> +Patchwork-id: 96734 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 06/11] hmat acpi: Build Memory Proximity Domain Attributes Structure(s) +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table +(HMAT). The specification references below link: +http://www.uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf + +It describes the memory attributes, such as memory side cache +attributes and bandwidth and latency details, related to the +Memory Proximity Domain. The software is +expected to use this information as hint for optimization. + +This structure describes Memory Proximity Domain Attributes by memory +subsystem and its associativity with processor proximity domain as well as +hint for memory usage. 
+ +In the Linux kernel, the code in drivers/acpi/hmat/hmat.c parses and reports +the platform's HMAT tables. + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Reviewed-by: Daniel Black +Reviewed-by: Jonathan Cameron +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-5-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit e6f123c3b81241be33f1b763d0ff8b36d1ae9c1e) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/acpi/Kconfig | 7 ++-- + hw/acpi/Makefile.objs | 1 + + hw/acpi/hmat.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/acpi/hmat.h | 42 ++++++++++++++++++++++ + hw/i386/acpi-build.c | 5 +++ + 5 files changed, 152 insertions(+), 2 deletions(-) + create mode 100644 hw/acpi/hmat.c + create mode 100644 hw/acpi/hmat.h + +diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig +index 12e3f1e..54209c6 100644 +--- a/hw/acpi/Kconfig ++++ b/hw/acpi/Kconfig +@@ -7,6 +7,7 @@ config ACPI_X86 + select ACPI_NVDIMM + select ACPI_CPU_HOTPLUG + select ACPI_MEMORY_HOTPLUG ++ select ACPI_HMAT + + config ACPI_X86_ICH + bool +@@ -23,6 +24,10 @@ config ACPI_NVDIMM + bool + depends on ACPI + ++config ACPI_HMAT ++ bool ++ depends on ACPI ++ + config ACPI_PCI + bool + depends on ACPI && PCI +@@ -33,5 +38,3 @@ config ACPI_VMGENID + depends on PC + + config ACPI_HW_REDUCED +- bool +- depends on ACPI +diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs +index 655a9c1..517bd88 100644 +--- a/hw/acpi/Makefile.objs ++++ b/hw/acpi/Makefile.objs +@@ -7,6 +7,7 @@ common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o + common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o + common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o + common-obj-$(CONFIG_ACPI_HW_REDUCED) += generic_event_device.o ++common-obj-$(CONFIG_ACPI_HMAT) += hmat.o + common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o + + common-obj-y += acpi_interface.o +diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c +new file
mode 100644 +index 0000000..9ff7930 +--- /dev/null ++++ b/hw/acpi/hmat.c +@@ -0,0 +1,99 @@ ++/* ++ * HMAT ACPI Implementation ++ * ++ * Copyright(C) 2019 Intel Corporation. ++ * ++ * Author: ++ * Liu jingqi ++ * Tao Xu ++ * ++ * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table ++ * (HMAT) ++ * ++ * This library is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This library is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with this library; if not, see <http://www.gnu.org/licenses/> ++ */ ++ ++#include "qemu/osdep.h" ++#include "sysemu/numa.h" ++#include "hw/acpi/hmat.h" ++ ++/* ++ * ACPI 6.3: ++ * 5.2.27.3 Memory Proximity Domain Attributes Structure: Table 5-145 ++ */ ++static void build_hmat_mpda(GArray *table_data, uint16_t flags, ++ uint32_t initiator, uint32_t mem_node) ++{ ++ ++ /* Memory Proximity Domain Attributes Structure */ ++ /* Type */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Length */ ++ build_append_int_noprefix(table_data, 40, 4); ++ /* Flags */ ++ build_append_int_noprefix(table_data, flags, 2); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Proximity Domain for the Attached Initiator */ ++ build_append_int_noprefix(table_data, initiator, 4); ++ /* Proximity Domain for the Memory */ ++ build_append_int_noprefix(table_data, mem_node, 4); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 4); ++ /* ++ * Reserved: ++ * Previously defined as the Start Address
of the System Physical ++ * Address Range. Deprecated since ACPI Spec 6.3. ++ */ ++ build_append_int_noprefix(table_data, 0, 8); ++ /* ++ * Reserved: ++ * Previously defined as the Range Length of the region in bytes. ++ * Deprecated since ACPI Spec 6.3. ++ */ ++ build_append_int_noprefix(table_data, 0, 8); ++} ++ ++/* Build HMAT sub table structures */ ++static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) ++{ ++ uint16_t flags; ++ int i; ++ ++ for (i = 0; i < numa_state->num_nodes; i++) { ++ flags = 0; ++ ++ if (numa_state->nodes[i].initiator < MAX_NODES) { ++ flags |= HMAT_PROXIMITY_INITIATOR_VALID; ++ } ++ ++ build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); ++ } ++} ++ ++void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) ++{ ++ int hmat_start = table_data->len; ++ ++ /* reserve space for HMAT header */ ++ acpi_data_push(table_data, 40); ++ ++ hmat_build_table_structs(table_data, numa_state); ++ ++ build_header(linker, table_data, ++ (void *)(table_data->data + hmat_start), ++ "HMAT", table_data->len - hmat_start, 2, NULL, NULL); ++} +diff --git a/hw/acpi/hmat.h b/hw/acpi/hmat.h +new file mode 100644 +index 0000000..437dbc6 +--- /dev/null ++++ b/hw/acpi/hmat.h +@@ -0,0 +1,42 @@ ++/* ++ * HMAT ACPI Implementation Header ++ * ++ * Copyright(C) 2019 Intel Corporation. ++ * ++ * Author: ++ * Liu jingqi ++ * Tao Xu ++ * ++ * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table ++ * (HMAT) ++ * ++ * This library is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This library is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with this library; if not, see <http://www.gnu.org/licenses/> ++ */ ++ ++#ifndef HMAT_H ++#define HMAT_H ++ ++#include "hw/acpi/aml-build.h" ++ ++/* ++ * ACPI 6.3: 5.2.27.3 Memory Proximity Domain Attributes Structure, ++ * Table 5-145, Field "flag", Bit [0]: set to 1 to indicate that data in ++ * the Proximity Domain for the Attached Initiator field is valid. ++ * Other bits reserved. ++ */ ++#define HMAT_PROXIMITY_INITIATOR_VALID 0x1 ++ ++void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state); ++ ++#endif +diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c +index 6400189..b1f8c55 100644 +--- a/hw/i386/acpi-build.c ++++ b/hw/i386/acpi-build.c +@@ -67,6 +67,7 @@ + #include "hw/i386/intel_iommu.h" + + #include "hw/acpi/ipmi.h" ++#include "hw/acpi/hmat.h" + + /* These are used to size the ACPI tables for -M pc-i440fx-1.7 and + * -M pc-i440fx-2.0.
Even if the actual amount of AML generated grows +@@ -2837,6 +2838,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) + acpi_add_table(table_offsets, tables_blob); + build_slit(tables_blob, tables->linker, machine); + } ++ if (machine->numa_state->hmat_enabled) { ++ acpi_add_table(table_offsets, tables_blob); ++ build_hmat(tables_blob, tables->linker, machine->numa_state); ++ } + } + if (acpi_get_mcfg(&mcfg)) { + acpi_add_table(table_offsets, tables_blob); +-- +1.8.3.1 + diff --git a/kvm-hmat-acpi-Build-Memory-Side-Cache-Information-Struct.patch b/kvm-hmat-acpi-Build-Memory-Side-Cache-Information-Struct.patch new file mode 100755 index 0000000..01ef4ce --- /dev/null +++ b/kvm-hmat-acpi-Build-Memory-Side-Cache-Information-Struct.patch @@ -0,0 +1,137 @@ +From d00453667cb972dc2fe1242081d3b39313a6a925 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:52 +0100 +Subject: [PATCH 09/12] hmat acpi: Build Memory Side Cache Information + Structure(s) + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-9-plai@redhat.com> +Patchwork-id: 96741 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 08/11] hmat acpi: Build Memory Side Cache Information Structure(s) +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +This structure describes memory side cache information for memory +proximity domains if the memory side cache is present and the +physical device forms the memory side cache. +The software could use this information to effectively place +the data in memory to maximize the performance of the system +memory that uses the memory side cache. + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Reviewed-by: Daniel Black +Reviewed-by: Jonathan Cameron +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-7-tao3.xu@intel.com> +Reviewed-by: Michael S.
Tsirkin +(cherry picked from commit a9c2b841af002db6e21e1297c9026b63fc22c875) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/acpi/hmat.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 68 insertions(+), 1 deletion(-) + +diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c +index 4635d45..7c24bb5 100644 +--- a/hw/acpi/hmat.c ++++ b/hw/acpi/hmat.c +@@ -143,14 +143,62 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, + g_free(entry_list); + } + ++/* ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: Table 5-147 */ ++static void build_hmat_cache(GArray *table_data, uint8_t total_levels, ++ NumaHmatCacheOptions *hmat_cache) ++{ ++ /* ++ * Cache Attributes: Bits [3:0] – Total Cache Levels ++ * for this Memory Proximity Domain ++ */ ++ uint32_t cache_attr = total_levels; ++ ++ /* Bits [7:4] : Cache Level described in this structure */ ++ cache_attr |= (uint32_t) hmat_cache->level << 4; ++ ++ /* Bits [11:8] - Cache Associativity */ ++ cache_attr |= (uint32_t) hmat_cache->associativity << 8; ++ ++ /* Bits [15:12] - Write Policy */ ++ cache_attr |= (uint32_t) hmat_cache->policy << 12; ++ ++ /* Bits [31:16] - Cache Line size in bytes */ ++ cache_attr |= (uint32_t) hmat_cache->line << 16; ++ ++ /* Type */ ++ build_append_int_noprefix(table_data, 2, 2); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Length */ ++ build_append_int_noprefix(table_data, 32, 4); ++ /* Proximity Domain for the Memory */ ++ build_append_int_noprefix(table_data, hmat_cache->node_id, 4); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 4); ++ /* Memory Side Cache Size */ ++ build_append_int_noprefix(table_data, hmat_cache->size, 8); ++ /* Cache Attributes */ ++ build_append_int_noprefix(table_data, cache_attr, 4); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* ++ * Number of SMBIOS handles (n) ++ * Linux kernel uses Memory Side Cache Information Structure ++ * 
without SMBIOS entries for now, so set Number of SMBIOS handles ++ * as 0. ++ */ ++ build_append_int_noprefix(table_data, 0, 2); ++} ++ + /* Build HMAT sub table structures */ + static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) + { + uint16_t flags; + uint32_t num_initiator = 0; + uint32_t initiator_list[MAX_NODES]; +- int i, hierarchy, type; ++ int i, hierarchy, type, cache_level, total_levels; + HMAT_LB_Info *hmat_lb; ++ NumaHmatCacheOptions *hmat_cache; + + for (i = 0; i < numa_state->num_nodes; i++) { + flags = 0; +@@ -184,6 +232,25 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) + } + } + } ++ ++ /* ++ * ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: ++ * Table 5-147 ++ */ ++ for (i = 0; i < numa_state->num_nodes; i++) { ++ total_levels = 0; ++ for (cache_level = 1; cache_level < HMAT_LB_LEVELS; cache_level++) { ++ if (numa_state->hmat_cache[i][cache_level]) { ++ total_levels++; ++ } ++ } ++ for (cache_level = 0; cache_level <= total_levels; cache_level++) { ++ hmat_cache = numa_state->hmat_cache[i][cache_level]; ++ if (hmat_cache) { ++ build_hmat_cache(table_data, total_levels, hmat_cache); ++ } ++ } ++ } + } + + void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) +-- +1.8.3.1 + diff --git a/kvm-hmat-acpi-Build-System-Locality-Latency-and-Bandwidt.patch b/kvm-hmat-acpi-Build-System-Locality-Latency-and-Bandwidt.patch new file mode 100755 index 0000000..a7120d7 --- /dev/null +++ b/kvm-hmat-acpi-Build-System-Locality-Latency-and-Bandwidt.patch @@ -0,0 +1,173 @@ +From f55b8b251c323856087baf2380d93fbf2da15db7 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:51 +0100 +Subject: [PATCH 08/12] hmat acpi: Build System Locality Latency and Bandwidth + Information Structure(s) + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-8-plai@redhat.com> +Patchwork-id: 96733 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 07/11] hmat 
acpi: Build System Locality Latency and Bandwidth Information Structure(s) +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +This structure describes the memory access latency and bandwidth +information from various memory access initiator proximity domains. +The latency and bandwidth numbers represented in this structure +correspond to rated latency and bandwidth for the platform. +The software could use this information as hint for optimization. + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-6-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 4586a2cb833f80b19c80ebe364a005ac2fa0974a) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/acpi/hmat.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 103 insertions(+), 1 deletion(-) + +diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c +index 9ff7930..4635d45 100644 +--- a/hw/acpi/hmat.c ++++ b/hw/acpi/hmat.c +@@ -25,6 +25,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/units.h" + #include "sysemu/numa.h" + #include "hw/acpi/hmat.h" + +@@ -67,11 +68,89 @@ static void build_hmat_mpda(GArray *table_data, uint16_t flags, + build_append_int_noprefix(table_data, 0, 8); + } + ++/* ++ * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information ++ * Structure: Table 5-146 ++ */ ++static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, ++ uint32_t num_initiator, uint32_t num_target, ++ uint32_t *initiator_list) ++{ ++ int i, index; ++ HMAT_LB_Data *lb_data; ++ uint16_t *entry_list; ++ uint32_t base; ++ /* Length in bytes for entire structure */ ++ uint32_t lb_length ++ = 32 /* Table length upto and including Entry Base Unit */ ++ + 4 * num_initiator /* Initiator Proximity Domain List */ ++ + 4 * num_target /* 
Target Proximity Domain List */ ++ + 2 * num_initiator * num_target; /* Latency or Bandwidth Entries */ ++ ++ /* Type */ ++ build_append_int_noprefix(table_data, 1, 2); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Length */ ++ build_append_int_noprefix(table_data, lb_length, 4); ++ /* Flags: Bits [3:0] Memory Hierarchy, Bits[7:4] Reserved */ ++ assert(!(hmat_lb->hierarchy >> 4)); ++ build_append_int_noprefix(table_data, hmat_lb->hierarchy, 1); ++ /* Data Type */ ++ build_append_int_noprefix(table_data, hmat_lb->data_type, 1); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Number of Initiator Proximity Domains (s) */ ++ build_append_int_noprefix(table_data, num_initiator, 4); ++ /* Number of Target Proximity Domains (t) */ ++ build_append_int_noprefix(table_data, num_target, 4); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 4); ++ ++ /* Entry Base Unit */ ++ if (hmat_lb->data_type <= HMAT_LB_DATA_WRITE_LATENCY) { ++ /* Convert latency base from nanoseconds to picosecond */ ++ base = hmat_lb->base * 1000; ++ } else { ++ /* Convert bandwidth base from Byte to Megabyte */ ++ base = hmat_lb->base / MiB; ++ } ++ build_append_int_noprefix(table_data, base, 8); ++ ++ /* Initiator Proximity Domain List */ ++ for (i = 0; i < num_initiator; i++) { ++ build_append_int_noprefix(table_data, initiator_list[i], 4); ++ } ++ ++ /* Target Proximity Domain List */ ++ for (i = 0; i < num_target; i++) { ++ build_append_int_noprefix(table_data, i, 4); ++ } ++ ++ /* Latency or Bandwidth Entries */ ++ entry_list = g_malloc0(num_initiator * num_target * sizeof(uint16_t)); ++ for (i = 0; i < hmat_lb->list->len; i++) { ++ lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); ++ index = lb_data->initiator * num_target + lb_data->target; ++ ++ entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); ++ } ++ ++ for (i = 0; i < num_initiator * num_target; i++) { ++ build_append_int_noprefix(table_data, entry_list[i], 
2); ++ } ++ ++ g_free(entry_list); ++} ++ + /* Build HMAT sub table structures */ + static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) + { + uint16_t flags; +- int i; ++ uint32_t num_initiator = 0; ++ uint32_t initiator_list[MAX_NODES]; ++ int i, hierarchy, type; ++ HMAT_LB_Info *hmat_lb; + + for (i = 0; i < numa_state->num_nodes; i++) { + flags = 0; +@@ -82,6 +161,29 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) + + build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); + } ++ ++ for (i = 0; i < numa_state->num_nodes; i++) { ++ if (numa_state->nodes[i].has_cpu) { ++ initiator_list[num_initiator++] = i; ++ } ++ } ++ ++ /* ++ * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information ++ * Structure: Table 5-146 ++ */ ++ for (hierarchy = HMAT_LB_MEM_MEMORY; ++ hierarchy <= HMAT_LB_MEM_CACHE_3RD_LEVEL; hierarchy++) { ++ for (type = HMAT_LB_DATA_ACCESS_LATENCY; ++ type <= HMAT_LB_DATA_WRITE_BANDWIDTH; type++) { ++ hmat_lb = numa_state->hmat_lb[hierarchy][type]; ++ ++ if (hmat_lb && hmat_lb->list->len) { ++ build_hmat_lb(table_data, hmat_lb, num_initiator, ++ numa_state->num_nodes, initiator_list); ++ } ++ } ++ } + } + + void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) +-- +1.8.3.1 + diff --git a/kvm-hmp-Allow-using-qdev-ID-for-qemu-io-command.patch b/kvm-hmp-Allow-using-qdev-ID-for-qemu-io-command.patch new file mode 100755 index 0000000..f01dec2 --- /dev/null +++ b/kvm-hmp-Allow-using-qdev-ID-for-qemu-io-command.patch @@ -0,0 +1,100 @@ +From cebc614e5ddd1f770c4d6dc26c066791f36e56df Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:02 +0000 +Subject: [PATCH 05/18] hmp: Allow using qdev ID for qemu-io command + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-5-kwolf@redhat.com> +Patchwork-id: 93750 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 4/6] hmp: Allow using qdev ID for qemu-io command +Bugzilla: 1781637 
+RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +In order to issue requests on an existing BlockBackend with the +'qemu-io' HMP command, allow specifying the BlockBackend not only with a +BlockBackend name, but also with a qdev ID/QOM path for a device that +owns the (possibly anonymous) BlockBackend. + +Because qdev names could be conflicting with BlockBackend and node +names, introduce a -d option to explicitly address a device. If the +option is not given, a BlockBackend or a node is addressed. + +Signed-off-by: Kevin Wolf +(cherry picked from commit 89b6fc45614bb45dcd58f1590415afe5c2791abd) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + hmp-commands.hx | 8 +++++--- + monitor/hmp-cmds.c | 28 ++++++++++++++++++---------- + 2 files changed, 23 insertions(+), 13 deletions(-) + +diff --git a/hmp-commands.hx b/hmp-commands.hx +index cfcc044..dc23185 100644 +--- a/hmp-commands.hx ++++ b/hmp-commands.hx +@@ -1875,9 +1875,11 @@ ETEXI + + { + .name = "qemu-io", +- .args_type = "device:B,command:s", +- .params = "[device] \"[command]\"", +- .help = "run a qemu-io command on a block device", ++ .args_type = "qdev:-d,device:B,command:s", ++ .params = "[-d] [device] \"[command]\"", ++ .help = "run a qemu-io command on a block device\n\t\t\t" ++ "-d: [device] is a device ID rather than a " ++ "drive ID or node name", + .cmd = hmp_qemu_io, + }, + +diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c +index b2551c1..5f8941d 100644 +--- a/monitor/hmp-cmds.c ++++ b/monitor/hmp-cmds.c +@@ -2468,23 +2468,31 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) + { + BlockBackend *blk; + BlockBackend *local_blk = NULL; ++ bool qdev = qdict_get_try_bool(qdict, "qdev", false); + const char* device = qdict_get_str(qdict, "device"); + const char* command = qdict_get_str(qdict, "command"); + Error *err = NULL; + int ret; + +- blk = blk_by_name(device); +- if (!blk) { +- BlockDriverState *bs = bdrv_lookup_bs(NULL, device, 
&err); +- if (bs) { +- blk = local_blk = blk_new(bdrv_get_aio_context(bs), +- 0, BLK_PERM_ALL); +- ret = blk_insert_bs(blk, bs, &err); +- if (ret < 0) { ++ if (qdev) { ++ blk = blk_by_qdev_id(device, &err); ++ if (!blk) { ++ goto fail; ++ } ++ } else { ++ blk = blk_by_name(device); ++ if (!blk) { ++ BlockDriverState *bs = bdrv_lookup_bs(NULL, device, &err); ++ if (bs) { ++ blk = local_blk = blk_new(bdrv_get_aio_context(bs), ++ 0, BLK_PERM_ALL); ++ ret = blk_insert_bs(blk, bs, &err); ++ if (ret < 0) { ++ goto fail; ++ } ++ } else { + goto fail; + } +- } else { +- goto fail; + } + } + +-- +1.8.3.1 + diff --git a/kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch b/kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch new file mode 100755 index 0000000..75788c5 --- /dev/null +++ b/kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch @@ -0,0 +1,222 @@ +From 602f17920e422e2b8d3ce485e56066a97b74e723 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:29 -0500 +Subject: [PATCH 05/17] hw/arm/smmu: Introduce SMMUTLBEntry for PTW and IOTLB + value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-5-eperezma@redhat.com> +Patchwork-id: 100597 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 04/13] hw/arm/smmu: Introduce SMMUTLBEntry for PTW and IOTLB value +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Introduce a specialized SMMUTLBEntry to store the result of +the PTW and cache in the IOTLB. This structure extends the +generic IOMMUTLBEntry struct with the level of the entry and +the granule size. + +Those latter will be useful when implementing range invalidation. 
+ +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-5-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit a7550158556b7fc2f2baaecf9092499c6687b160) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 32 +++++++++++++++++--------------- + hw/arm/smmuv3.c | 10 +++++----- + include/hw/arm/smmu-common.h | 12 +++++++++--- + 3 files changed, 31 insertions(+), 23 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 0b89c9fbbbc..06e9e38b007 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -64,11 +64,11 @@ SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova) + return key; + } + +-IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, +- hwaddr iova) ++SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, ++ hwaddr iova) + { + SMMUIOTLBKey key = smmu_get_iotlb_key(cfg->asid, iova); +- IOMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); ++ SMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); + + if (entry) { + cfg->iotlb_hits++; +@@ -86,7 +86,7 @@ IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, + return entry; + } + +-void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry) ++void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new) + { + SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); + +@@ -94,9 +94,9 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry) + smmu_iotlb_inv_all(bs); + } + +- *key = smmu_get_iotlb_key(cfg->asid, entry->iova); +- trace_smmu_iotlb_insert(cfg->asid, entry->iova); +- g_hash_table_insert(bs->iotlb, key, entry); ++ *key = smmu_get_iotlb_key(cfg->asid, new->entry.iova); ++ trace_smmu_iotlb_insert(cfg->asid, new->entry.iova); ++ g_hash_table_insert(bs->iotlb, key, new); + } + + inline void smmu_iotlb_inv_all(SMMUState *s) +@@ -217,7 +217,7 @@ SMMUTransTableInfo 
*select_tt(SMMUTransCfg *cfg, dma_addr_t iova) + * @cfg: translation config + * @iova: iova to translate + * @perm: access type +- * @tlbe: IOMMUTLBEntry (out) ++ * @tlbe: SMMUTLBEntry (out) + * @info: handle to an error info + * + * Return 0 on success, < 0 on error. In case of error, @info is filled +@@ -227,7 +227,7 @@ SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova) + */ + static int smmu_ptw_64(SMMUTransCfg *cfg, + dma_addr_t iova, IOMMUAccessFlags perm, +- IOMMUTLBEntry *tlbe, SMMUPTWEventInfo *info) ++ SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info) + { + dma_addr_t baseaddr, indexmask; + int stage = cfg->stage; +@@ -247,8 +247,8 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + baseaddr = extract64(tt->ttb, 0, 48); + baseaddr &= ~indexmask; + +- tlbe->iova = iova; +- tlbe->addr_mask = (1 << granule_sz) - 1; ++ tlbe->entry.iova = iova; ++ tlbe->entry.addr_mask = (1 << granule_sz) - 1; + + while (level <= 3) { + uint64_t subpage_size = 1ULL << level_shift(level, granule_sz); +@@ -299,14 +299,16 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + goto error; + } + +- tlbe->translated_addr = gpa + (iova & mask); +- tlbe->perm = PTE_AP_TO_PERM(ap); ++ tlbe->entry.translated_addr = gpa + (iova & mask); ++ tlbe->entry.perm = PTE_AP_TO_PERM(ap); ++ tlbe->level = level; ++ tlbe->granule = granule_sz; + return 0; + } + info->type = SMMU_PTW_ERR_TRANSLATION; + + error: +- tlbe->perm = IOMMU_NONE; ++ tlbe->entry.perm = IOMMU_NONE; + return -EINVAL; + } + +@@ -322,7 +324,7 @@ error: + * return 0 on success + */ + inline int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm, +- IOMMUTLBEntry *tlbe, SMMUPTWEventInfo *info) ++ SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info) + { + if (!cfg->aa64) { + /* +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 34dea4df4da..ad8212779d3 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -614,7 +614,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + SMMUTranslationStatus 
status; + SMMUState *bs = ARM_SMMU(s); + uint64_t page_mask, aligned_addr; +- IOMMUTLBEntry *cached_entry = NULL; ++ SMMUTLBEntry *cached_entry = NULL; + SMMUTransTableInfo *tt; + SMMUTransCfg *cfg = NULL; + IOMMUTLBEntry entry = { +@@ -664,7 +664,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + + cached_entry = smmu_iotlb_lookup(bs, cfg, aligned_addr); + if (cached_entry) { +- if ((flag & IOMMU_WO) && !(cached_entry->perm & IOMMU_WO)) { ++ if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) { + status = SMMU_TRANS_ERROR; + if (event.record_trans_faults) { + event.type = SMMU_EVT_F_PERMISSION; +@@ -677,7 +677,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + goto epilogue; + } + +- cached_entry = g_new0(IOMMUTLBEntry, 1); ++ cached_entry = g_new0(SMMUTLBEntry, 1); + + if (smmu_ptw(cfg, aligned_addr, flag, cached_entry, &ptw_info)) { + g_free(cached_entry); +@@ -731,9 +731,9 @@ epilogue: + switch (status) { + case SMMU_TRANS_SUCCESS: + entry.perm = flag; +- entry.translated_addr = cached_entry->translated_addr + ++ entry.translated_addr = cached_entry->entry.translated_addr + + (addr & page_mask); +- entry.addr_mask = cached_entry->addr_mask; ++ entry.addr_mask = cached_entry->entry.addr_mask; + trace_smmuv3_translate_success(mr->parent_obj.name, sid, addr, + entry.translated_addr, entry.perm); + break; +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index bceba40885c..277923bdc0a 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -52,6 +52,12 @@ typedef struct SMMUTransTableInfo { + uint8_t granule_sz; /* granule page shift */ + } SMMUTransTableInfo; + ++typedef struct SMMUTLBEntry { ++ IOMMUTLBEntry entry; ++ uint8_t level; ++ uint8_t granule; ++} SMMUTLBEntry; ++ + /* + * Generic structure populated by derived SMMU devices + * after decoding the configuration information and used as +@@ -140,7 +146,7 @@ static inline uint16_t 
smmu_get_sid(SMMUDevice *sdev) + * pair, according to @cfg translation config + */ + int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm, +- IOMMUTLBEntry *tlbe, SMMUPTWEventInfo *info); ++ SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info); + + /** + * select_tt - compute which translation table shall be used according to +@@ -153,8 +159,8 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); + + #define SMMU_IOTLB_MAX_SIZE 256 + +-IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); +-void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry); ++SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); ++void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *entry); + SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); +-- +2.27.0 + diff --git a/kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch b/kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch new file mode 100755 index 0000000..6500b41 --- /dev/null +++ b/kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch @@ -0,0 +1,166 @@ +From 7833c0bf8321cb39614ee889cf3e3a64511c0aa5 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:28 -0500 +Subject: [PATCH 04/17] hw/arm/smmu: Introduce smmu_get_iotlb_key() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-4-eperezma@redhat.com> +Patchwork-id: 100596 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 03/13] hw/arm/smmu: Introduce smmu_get_iotlb_key() +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Introduce the smmu_get_iotlb_key() helper and the +SMMU_IOTLB_ASID() macro. Also move smmu_get_iotlb_key and +smmu_iotlb_key_hash in the IOTLB related code section. 
+ +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-4-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit 60a61f1b31fc03080aadb63c9b1006f8b1972adb) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 66 ++++++++++++++++++++---------------- + hw/arm/smmu-internal.h | 1 + + include/hw/arm/smmu-common.h | 1 + + 3 files changed, 38 insertions(+), 30 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 8e01505dbee..0b89c9fbbbc 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -32,10 +32,42 @@ + + /* IOTLB Management */ + ++static guint smmu_iotlb_key_hash(gconstpointer v) ++{ ++ SMMUIOTLBKey *key = (SMMUIOTLBKey *)v; ++ uint32_t a, b, c; ++ ++ /* Jenkins hash */ ++ a = b = c = JHASH_INITVAL + sizeof(*key); ++ a += key->asid; ++ b += extract64(key->iova, 0, 32); ++ c += extract64(key->iova, 32, 32); ++ ++ __jhash_mix(a, b, c); ++ __jhash_final(a, b, c); ++ ++ return c; ++} ++ ++static gboolean smmu_iotlb_key_equal(gconstpointer v1, gconstpointer v2) ++{ ++ const SMMUIOTLBKey *k1 = v1; ++ const SMMUIOTLBKey *k2 = v2; ++ ++ return (k1->asid == k2->asid) && (k1->iova == k2->iova); ++} ++ ++SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova) ++{ ++ SMMUIOTLBKey key = {.asid = asid, .iova = iova}; ++ ++ return key; ++} ++ + IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, + hwaddr iova) + { +- SMMUIOTLBKey key = {.asid = cfg->asid, .iova = iova}; ++ SMMUIOTLBKey key = smmu_get_iotlb_key(cfg->asid, iova); + IOMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); + + if (entry) { +@@ -62,8 +94,7 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry) + smmu_iotlb_inv_all(bs); + } + +- key->asid = cfg->asid; +- key->iova = entry->iova; ++ *key = smmu_get_iotlb_key(cfg->asid, entry->iova); + trace_smmu_iotlb_insert(cfg->asid, entry->iova); + 
g_hash_table_insert(bs->iotlb, key, entry); + } +@@ -80,12 +111,12 @@ static gboolean smmu_hash_remove_by_asid(gpointer key, gpointer value, + uint16_t asid = *(uint16_t *)user_data; + SMMUIOTLBKey *iotlb_key = (SMMUIOTLBKey *)key; + +- return iotlb_key->asid == asid; ++ return SMMU_IOTLB_ASID(*iotlb_key) == asid; + } + + inline void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova) + { +- SMMUIOTLBKey key = {.asid = asid, .iova = iova}; ++ SMMUIOTLBKey key = smmu_get_iotlb_key(asid, iova); + + trace_smmu_iotlb_inv_iova(asid, iova); + g_hash_table_remove(s->iotlb, &key); +@@ -382,31 +413,6 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) + return NULL; + } + +-static guint smmu_iotlb_key_hash(gconstpointer v) +-{ +- SMMUIOTLBKey *key = (SMMUIOTLBKey *)v; +- uint32_t a, b, c; +- +- /* Jenkins hash */ +- a = b = c = JHASH_INITVAL + sizeof(*key); +- a += key->asid; +- b += extract64(key->iova, 0, 32); +- c += extract64(key->iova, 32, 32); +- +- __jhash_mix(a, b, c); +- __jhash_final(a, b, c); +- +- return c; +-} +- +-static gboolean smmu_iotlb_key_equal(gconstpointer v1, gconstpointer v2) +-{ +- const SMMUIOTLBKey *k1 = v1; +- const SMMUIOTLBKey *k2 = v2; +- +- return (k1->asid == k2->asid) && (k1->iova == k2->iova); +-} +- + /* Unmap the whole notifier's range */ + static void smmu_unmap_notifier_range(IOMMUNotifier *n) + { +diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h +index 7794d6d3947..3104f768cd2 100644 +--- a/hw/arm/smmu-internal.h ++++ b/hw/arm/smmu-internal.h +@@ -96,4 +96,5 @@ uint64_t iova_level_offset(uint64_t iova, int inputsize, + MAKE_64BIT_MASK(0, gsz - 3); + } + ++#define SMMU_IOTLB_ASID(key) ((key).asid) + #endif +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index a28650c9350..bceba40885c 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -155,6 +155,7 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); + + IOMMUTLBEntry 
*smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); + void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry); ++SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); + void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova); +-- +2.27.0 + diff --git a/kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch b/kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch new file mode 100755 index 0000000..ebe3d15 --- /dev/null +++ b/kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch @@ -0,0 +1,181 @@ +From fbfa584e58a560f27081043ad8e90ee9022421c0 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:27 -0500 +Subject: [PATCH 03/17] hw/arm/smmu-common: Add IOTLB helpers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-3-eperezma@redhat.com> +Patchwork-id: 100595 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 02/13] hw/arm/smmu-common: Add IOTLB helpers +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Add two helpers: one to lookup for a given IOTLB entry and +one to insert a new entry. We also move the tracing there. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-3-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit 6808bca939b8722d98165319ba42366ca80de907) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/arm/smmu-common.c | 36 ++++++++++++++++++++++++++++++++++++ + hw/arm/smmuv3.c | 26 ++------------------------ + hw/arm/trace-events | 5 +++-- + include/hw/arm/smmu-common.h | 2 ++ + 4 files changed, 43 insertions(+), 26 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index d2ba8b224ba..8e01505dbee 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -32,6 +32,42 @@ + + /* IOTLB Management */ + ++IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, ++ hwaddr iova) ++{ ++ SMMUIOTLBKey key = {.asid = cfg->asid, .iova = iova}; ++ IOMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); ++ ++ if (entry) { ++ cfg->iotlb_hits++; ++ trace_smmu_iotlb_lookup_hit(cfg->asid, iova, ++ cfg->iotlb_hits, cfg->iotlb_misses, ++ 100 * cfg->iotlb_hits / ++ (cfg->iotlb_hits + cfg->iotlb_misses)); ++ } else { ++ cfg->iotlb_misses++; ++ trace_smmu_iotlb_lookup_miss(cfg->asid, iova, ++ cfg->iotlb_hits, cfg->iotlb_misses, ++ 100 * cfg->iotlb_hits / ++ (cfg->iotlb_hits + cfg->iotlb_misses)); ++ } ++ return entry; ++} ++ ++void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry) ++{ ++ SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); ++ ++ if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { ++ smmu_iotlb_inv_all(bs); ++ } ++ ++ key->asid = cfg->asid; ++ key->iova = entry->iova; ++ trace_smmu_iotlb_insert(cfg->asid, entry->iova); ++ g_hash_table_insert(bs->iotlb, key, entry); ++} ++ + inline void smmu_iotlb_inv_all(SMMUState *s) + { + trace_smmu_iotlb_inv_all(); +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index e2fbb8357ea..34dea4df4da 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -624,7 +624,6 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + .addr_mask = ~(hwaddr)0, + .perm = IOMMU_NONE, + }; +- SMMUIOTLBKey key, *new_key; + + qemu_mutex_lock(&s->mutex); + +@@ -663,16 +662,8 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr 
addr, + page_mask = (1ULL << (tt->granule_sz)) - 1; + aligned_addr = addr & ~page_mask; + +- key.asid = cfg->asid; +- key.iova = aligned_addr; +- +- cached_entry = g_hash_table_lookup(bs->iotlb, &key); ++ cached_entry = smmu_iotlb_lookup(bs, cfg, aligned_addr); + if (cached_entry) { +- cfg->iotlb_hits++; +- trace_smmu_iotlb_cache_hit(cfg->asid, aligned_addr, +- cfg->iotlb_hits, cfg->iotlb_misses, +- 100 * cfg->iotlb_hits / +- (cfg->iotlb_hits + cfg->iotlb_misses)); + if ((flag & IOMMU_WO) && !(cached_entry->perm & IOMMU_WO)) { + status = SMMU_TRANS_ERROR; + if (event.record_trans_faults) { +@@ -686,16 +677,6 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + goto epilogue; + } + +- cfg->iotlb_misses++; +- trace_smmu_iotlb_cache_miss(cfg->asid, addr & ~page_mask, +- cfg->iotlb_hits, cfg->iotlb_misses, +- 100 * cfg->iotlb_hits / +- (cfg->iotlb_hits + cfg->iotlb_misses)); +- +- if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { +- smmu_iotlb_inv_all(bs); +- } +- + cached_entry = g_new0(IOMMUTLBEntry, 1); + + if (smmu_ptw(cfg, aligned_addr, flag, cached_entry, &ptw_info)) { +@@ -741,10 +722,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + } + status = SMMU_TRANS_ERROR; + } else { +- new_key = g_new0(SMMUIOTLBKey, 1); +- new_key->asid = cfg->asid; +- new_key->iova = aligned_addr; +- g_hash_table_insert(bs->iotlb, new_key, cached_entry); ++ smmu_iotlb_insert(bs, cfg, cached_entry); + status = SMMU_TRANS_SUCCESS; + } + +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 0acedcedc6f..b808a1bfc19 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -14,6 +14,9 @@ smmu_iotlb_inv_all(void) "IOTLB invalidate all" + smmu_iotlb_inv_asid(uint16_t asid) "IOTLB invalidate asid=%d" + smmu_iotlb_inv_iova(uint16_t asid, uint64_t addr) "IOTLB invalidate asid=%d addr=0x%"PRIx64 + smmu_inv_notifiers_mr(const char *name) "iommu mr=%s" ++smmu_iotlb_lookup_hit(uint16_t asid, uint64_t addr, uint32_t 
hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" ++smmu_iotlb_lookup_miss(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache MISS asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" ++smmu_iotlb_insert(uint16_t asid, uint64_t addr) "IOTLB ++ asid=%d addr=0x%"PRIx64 + + # smmuv3.c + smmuv3_read_mmio(uint64_t addr, uint64_t val, unsigned size, uint32_t r) "addr: 0x%"PRIx64" val:0x%"PRIx64" size: 0x%x(%d)" +@@ -46,8 +49,6 @@ smmuv3_cmdq_tlbi_nh_va(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d a + smmuv3_cmdq_tlbi_nh_vaa(int vmid, uint64_t addr) "vmid =%d addr=0x%"PRIx64 + smmuv3_cmdq_tlbi_nh(void) "" + smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d" +-smmu_iotlb_cache_hit(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" +-smmu_iotlb_cache_miss(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache MISS asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" + smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid %d" + smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 1f37844e5c9..a28650c9350 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -153,6 +153,8 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); + + #define SMMU_IOTLB_MAX_SIZE 256 + ++IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); ++void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); + void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova); +-- +2.27.0 + diff --git 
a/kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch b/kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch new file mode 100755 index 0000000..d973b13 --- /dev/null +++ b/kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch @@ -0,0 +1,124 @@ +From 79718d8c67c9c54fa86a77f66aa8784aca7651d5 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:26 -0500 +Subject: [PATCH 02/17] hw/arm/smmu-common: Factorize some code in + smmu_ptw_64() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-2-eperezma@redhat.com> +Patchwork-id: 100594 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 01/13] hw/arm/smmu-common: Factorize some code in smmu_ptw_64() +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Page and block PTE decoding can share some code. Let's +first handle table PTE and factorize some code shared by +page and block PTEs. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-2-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit 1733837d7cdb207653a849a5f1fa78de878c6ac1) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/arm/smmu-common.c | 48 ++++++++++++++++---------------------------- + 1 file changed, 17 insertions(+), 31 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 245817d23e9..d2ba8b224ba 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -187,7 +187,7 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + uint64_t subpage_size = 1ULL << level_shift(level, granule_sz); + uint64_t mask = subpage_size - 1; + uint32_t offset = iova_level_offset(iova, inputsize, level, granule_sz); +- uint64_t pte; ++ uint64_t pte, gpa; + dma_addr_t pte_addr = baseaddr + offset * sizeof(pte); + uint8_t ap; + +@@ -200,56 +200,42 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + if (is_invalid_pte(pte) || is_reserved_pte(pte, level)) { + trace_smmu_ptw_invalid_pte(stage, level, baseaddr, + pte_addr, offset, pte); +- info->type = SMMU_PTW_ERR_TRANSLATION; +- goto error; ++ break; + } + +- if (is_page_pte(pte, level)) { +- uint64_t gpa = get_page_pte_address(pte, granule_sz); ++ if (is_table_pte(pte, level)) { ++ ap = PTE_APTABLE(pte); + +- ap = PTE_AP(pte); + if (is_permission_fault(ap, perm)) { + info->type = SMMU_PTW_ERR_PERMISSION; + goto error; + } +- +- tlbe->translated_addr = gpa + (iova & mask); +- tlbe->perm = PTE_AP_TO_PERM(ap); ++ baseaddr = get_table_pte_address(pte, granule_sz); ++ level++; ++ continue; ++ } else if (is_page_pte(pte, level)) { ++ gpa = get_page_pte_address(pte, granule_sz); + trace_smmu_ptw_page_pte(stage, level, iova, + baseaddr, pte_addr, pte, gpa); +- return 0; +- } +- if (is_block_pte(pte, level)) { ++ } else { + uint64_t block_size; +- hwaddr gpa = get_block_pte_address(pte, level, granule_sz, +- &block_size); +- +- ap = PTE_AP(pte); +- if (is_permission_fault(ap, perm)) { +- info->type = SMMU_PTW_ERR_PERMISSION; +- goto error; +- } + ++ gpa = get_block_pte_address(pte, level, granule_sz, ++ &block_size); + trace_smmu_ptw_block_pte(stage, level, baseaddr, + pte_addr, pte, iova, gpa, + block_size >> 20); +- 
+- tlbe->translated_addr = gpa + (iova & mask); +- tlbe->perm = PTE_AP_TO_PERM(ap); +- return 0; + } +- +- /* table pte */ +- ap = PTE_APTABLE(pte); +- ++ ap = PTE_AP(pte); + if (is_permission_fault(ap, perm)) { + info->type = SMMU_PTW_ERR_PERMISSION; + goto error; + } +- baseaddr = get_table_pte_address(pte, granule_sz); +- level++; +- } + ++ tlbe->translated_addr = gpa + (iova & mask); ++ tlbe->perm = PTE_AP_TO_PERM(ap); ++ return 0; ++ } + info->type = SMMU_PTW_ERR_TRANSLATION; + + error: +-- +2.27.0 + diff --git a/kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch b/kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch new file mode 100755 index 0000000..e118225 --- /dev/null +++ b/kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch @@ -0,0 +1,274 @@ +From 4770f43dab482e4585d3555933a473cf24e796db Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:30 -0500 +Subject: [PATCH 06/17] hw/arm/smmu-common: Manage IOTLB block entries +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-6-eperezma@redhat.com> +Patchwork-id: 100598 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 05/13] hw/arm/smmu-common: Manage IOTLB block entries +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +At the moment each entry in the IOTLB corresponds to a page sized +mapping (4K, 16K or 64K), even if the page belongs to a mapped +block. In case of block mapping this unefficiently consumes IOTLB +entries. + +Change the value of the entry so that it reflects the actual +mapping it belongs to (block or page start address and size). + +Also the level/tg of the entry is encoded in the key. In subsequent +patches we will enable range invalidation. This latter is able +to provide the level/tg of the entry. 
+ +Encoding the level/tg directly in the key will allow to invalidate +using g_hash_table_remove() when num_pages equals to 1. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-6-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit 9e54dee71fcfaae69f87b8e1f51485a832266a39) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 67 ++++++++++++++++++++++++++---------- + hw/arm/smmu-internal.h | 7 ++++ + hw/arm/smmuv3.c | 6 ++-- + hw/arm/trace-events | 2 +- + include/hw/arm/smmu-common.h | 10 ++++-- + 5 files changed, 67 insertions(+), 25 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 06e9e38b007..8007edeaaa2 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -39,7 +39,7 @@ static guint smmu_iotlb_key_hash(gconstpointer v) + + /* Jenkins hash */ + a = b = c = JHASH_INITVAL + sizeof(*key); +- a += key->asid; ++ a += key->asid + key->level + key->tg; + b += extract64(key->iova, 0, 32); + c += extract64(key->iova, 32, 32); + +@@ -51,24 +51,41 @@ static guint smmu_iotlb_key_hash(gconstpointer v) + + static gboolean smmu_iotlb_key_equal(gconstpointer v1, gconstpointer v2) + { +- const SMMUIOTLBKey *k1 = v1; +- const SMMUIOTLBKey *k2 = v2; ++ SMMUIOTLBKey *k1 = (SMMUIOTLBKey *)v1, *k2 = (SMMUIOTLBKey *)v2; + +- return (k1->asid == k2->asid) && (k1->iova == k2->iova); ++ return (k1->asid == k2->asid) && (k1->iova == k2->iova) && ++ (k1->level == k2->level) && (k1->tg == k2->tg); + } + +-SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova) ++SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova, ++ uint8_t tg, uint8_t level) + { +- SMMUIOTLBKey key = {.asid = asid, .iova = iova}; ++ SMMUIOTLBKey key = {.asid = asid, .iova = iova, .tg = tg, .level = level}; + + return key; + } + + SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, +- hwaddr iova) ++ SMMUTransTableInfo *tt, hwaddr iova) + { 
+- SMMUIOTLBKey key = smmu_get_iotlb_key(cfg->asid, iova); +- SMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); ++ uint8_t tg = (tt->granule_sz - 10) / 2; ++ uint8_t inputsize = 64 - tt->tsz; ++ uint8_t stride = tt->granule_sz - 3; ++ uint8_t level = 4 - (inputsize - 4) / stride; ++ SMMUTLBEntry *entry = NULL; ++ ++ while (level <= 3) { ++ uint64_t subpage_size = 1ULL << level_shift(level, tt->granule_sz); ++ uint64_t mask = subpage_size - 1; ++ SMMUIOTLBKey key; ++ ++ key = smmu_get_iotlb_key(cfg->asid, iova & ~mask, tg, level); ++ entry = g_hash_table_lookup(bs->iotlb, &key); ++ if (entry) { ++ break; ++ } ++ level++; ++ } + + if (entry) { + cfg->iotlb_hits++; +@@ -89,13 +106,14 @@ SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, + void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new) + { + SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); ++ uint8_t tg = (new->granule - 10) / 2; + + if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { + smmu_iotlb_inv_all(bs); + } + +- *key = smmu_get_iotlb_key(cfg->asid, new->entry.iova); +- trace_smmu_iotlb_insert(cfg->asid, new->entry.iova); ++ *key = smmu_get_iotlb_key(cfg->asid, new->entry.iova, tg, new->level); ++ trace_smmu_iotlb_insert(cfg->asid, new->entry.iova, tg, new->level); + g_hash_table_insert(bs->iotlb, key, new); + } + +@@ -114,12 +132,26 @@ static gboolean smmu_hash_remove_by_asid(gpointer key, gpointer value, + return SMMU_IOTLB_ASID(*iotlb_key) == asid; + } + +-inline void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova) ++static gboolean smmu_hash_remove_by_asid_iova(gpointer key, gpointer value, ++ gpointer user_data) + { +- SMMUIOTLBKey key = smmu_get_iotlb_key(asid, iova); ++ SMMUTLBEntry *iter = (SMMUTLBEntry *)value; ++ IOMMUTLBEntry *entry = &iter->entry; ++ SMMUIOTLBPageInvInfo *info = (SMMUIOTLBPageInvInfo *)user_data; ++ SMMUIOTLBKey iotlb_key = *(SMMUIOTLBKey *)key; ++ ++ if (info->asid >= 0 && info->asid != 
SMMU_IOTLB_ASID(iotlb_key)) { ++ return false; ++ } ++ return (info->iova & ~entry->addr_mask) == entry->iova; ++} ++ ++inline void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova) ++{ ++ SMMUIOTLBPageInvInfo info = {.asid = asid, .iova = iova}; + + trace_smmu_iotlb_inv_iova(asid, iova); +- g_hash_table_remove(s->iotlb, &key); ++ g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_asid_iova, &info); + } + + inline void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid) +@@ -247,9 +279,6 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + baseaddr = extract64(tt->ttb, 0, 48); + baseaddr &= ~indexmask; + +- tlbe->entry.iova = iova; +- tlbe->entry.addr_mask = (1 << granule_sz) - 1; +- + while (level <= 3) { + uint64_t subpage_size = 1ULL << level_shift(level, granule_sz); + uint64_t mask = subpage_size - 1; +@@ -299,7 +328,9 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + goto error; + } + +- tlbe->entry.translated_addr = gpa + (iova & mask); ++ tlbe->entry.translated_addr = gpa; ++ tlbe->entry.iova = iova & ~mask; ++ tlbe->entry.addr_mask = mask; + tlbe->entry.perm = PTE_AP_TO_PERM(ap); + tlbe->level = level; + tlbe->granule = granule_sz; +diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h +index 3104f768cd2..55147f29be4 100644 +--- a/hw/arm/smmu-internal.h ++++ b/hw/arm/smmu-internal.h +@@ -97,4 +97,11 @@ uint64_t iova_level_offset(uint64_t iova, int inputsize, + } + + #define SMMU_IOTLB_ASID(key) ((key).asid) ++ ++typedef struct SMMUIOTLBPageInvInfo { ++ int asid; ++ uint64_t iova; ++ uint64_t mask; ++} SMMUIOTLBPageInvInfo; ++ + #endif +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index ad8212779d3..067c9480a03 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -662,7 +662,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + page_mask = (1ULL << (tt->granule_sz)) - 1; + aligned_addr = addr & ~page_mask; + +- cached_entry = smmu_iotlb_lookup(bs, cfg, aligned_addr); ++ cached_entry = smmu_iotlb_lookup(bs, cfg, 
tt, aligned_addr); + if (cached_entry) { + if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) { + status = SMMU_TRANS_ERROR; +@@ -732,7 +732,7 @@ epilogue: + case SMMU_TRANS_SUCCESS: + entry.perm = flag; + entry.translated_addr = cached_entry->entry.translated_addr + +- (addr & page_mask); ++ (addr & cached_entry->entry.addr_mask); + entry.addr_mask = cached_entry->entry.addr_mask; + trace_smmuv3_translate_success(mr->parent_obj.name, sid, addr, + entry.translated_addr, entry.perm); +@@ -960,7 +960,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + + trace_smmuv3_cmdq_tlbi_nh_vaa(vmid, addr); + smmuv3_inv_notifiers_iova(bs, -1, addr); +- smmu_iotlb_inv_all(bs); ++ smmu_iotlb_inv_iova(bs, -1, addr); + break; + } + case SMMU_CMD_TLBI_NH_VA: +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index b808a1bfc19..f74d3e920f1 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -16,7 +16,7 @@ smmu_iotlb_inv_iova(uint16_t asid, uint64_t addr) "IOTLB invalidate asid=%d addr + smmu_inv_notifiers_mr(const char *name) "iommu mr=%s" + smmu_iotlb_lookup_hit(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" + smmu_iotlb_lookup_miss(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache MISS asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" +-smmu_iotlb_insert(uint16_t asid, uint64_t addr) "IOTLB ++ asid=%d addr=0x%"PRIx64 ++smmu_iotlb_insert(uint16_t asid, uint64_t addr, uint8_t tg, uint8_t level) "IOTLB ++ asid=%d addr=0x%"PRIx64" tg=%d level=%d" + + # smmuv3.c + smmuv3_read_mmio(uint64_t addr, uint64_t val, unsigned size, uint32_t r) "addr: 0x%"PRIx64" val:0x%"PRIx64" size: 0x%x(%d)" +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 277923bdc0a..bbf3abc41fd 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -97,6 +97,8 @@ typedef struct SMMUPciBus { + typedef 
struct SMMUIOTLBKey { + uint64_t iova; + uint16_t asid; ++ uint8_t tg; ++ uint8_t level; + } SMMUIOTLBKey; + + typedef struct SMMUState { +@@ -159,12 +161,14 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); + + #define SMMU_IOTLB_MAX_SIZE 256 + +-SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); ++SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, ++ SMMUTransTableInfo *tt, hwaddr iova); + void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *entry); +-SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova); ++SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova, ++ uint8_t tg, uint8_t level); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); +-void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova); ++void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova); + + /* Unmap the range of all the notifiers registered to any IOMMU mr */ + void smmu_inv_notifiers_all(SMMUState *s); +-- +2.27.0 + diff --git a/kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch b/kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch new file mode 100755 index 0000000..79e75d8 --- /dev/null +++ b/kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch @@ -0,0 +1,67 @@ +From 69d71311d3d70282dec3d1f19f9e4b90c7b7c6b9 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:33 -0500 +Subject: [PATCH 09/17] hw/arm/smmuv3: Fix potential integer overflow (CID + 1432363) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-9-eperezma@redhat.com> +Patchwork-id: 100601 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 08/13] hw/arm/smmuv3: Fix potential integer overflow (CID 1432363) +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Philippe Mathieu-Daudé + 
+Use the BIT_ULL() macro to ensure we use 64-bit arithmetic. +This fixes the following Coverity issue (OVERFLOW_BEFORE_WIDEN): + + CID 1432363 (#1 of 1): Unintentional integer overflow: + + overflow_before_widen: + Potentially overflowing expression 1 << scale with type int + (32 bits, signed) is evaluated using 32-bit arithmetic, and + then used in a context that expects an expression of type + hwaddr (64 bits, unsigned). + +Signed-off-by: Philippe Mathieu-Daudé +Acked-by: Eric Auger +Message-id: 20201030144617.1535064-1-philmd@redhat.com +Reviewed-by: Peter Maydell +Signed-off-by: Peter Maydell +(cherry picked from commit 744a790ec01a30033309e6a2155df4d61061e184) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmuv3.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index f4d5d9d8222..a418fab2aa6 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -17,6 +17,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/bitops.h" + #include "hw/irq.h" + #include "hw/sysbus.h" + #include "migration/vmstate.h" +@@ -847,7 +848,7 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + scale = CMD_SCALE(cmd); + num = CMD_NUM(cmd); + ttl = CMD_TTL(cmd); +- num_pages = (num + 1) * (1 << (scale)); ++ num_pages = (num + 1) * BIT_ULL(scale); + } + + if (type == SMMU_CMD_TLBI_NH_VA) { +-- +2.27.0 + diff --git a/kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch b/kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch new file mode 100755 index 0000000..fd52e0c --- /dev/null +++ b/kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch @@ -0,0 +1,255 @@ +From 3f027ac56449e51a61e76c18b97fd341d302dc80 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:32 -0500 +Subject: [PATCH 08/17] hw/arm/smmuv3: Get prepared for range invalidation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: 
eperezma +Message-id: <20210112143638.374060-8-eperezma@redhat.com> +Patchwork-id: 100600 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 07/13] hw/arm/smmuv3: Get prepared for range invalidation +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Enhance the smmu_iotlb_inv_iova() helper with range invalidation. +This uses the new fields passed in the NH_VA and NH_VAA commands: +the size of the range, the level and the granule. + +As NH_VA and NH_VAA both use those fields, their decoding and +handling is factorized in a new smmuv3_s1_range_inval() helper. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-8-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit d52915616c059ed273caa2d496b58e5d215c5962) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 25 +++++++++++--- + hw/arm/smmuv3-internal.h | 4 +++ + hw/arm/smmuv3.c | 64 +++++++++++++++++++++++------------- + hw/arm/trace-events | 4 +-- + include/hw/arm/smmu-common.h | 3 +- + 5 files changed, 69 insertions(+), 31 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 8007edeaaa2..9780404f002 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -143,15 +143,30 @@ static gboolean smmu_hash_remove_by_asid_iova(gpointer key, gpointer value, + if (info->asid >= 0 && info->asid != SMMU_IOTLB_ASID(iotlb_key)) { + return false; + } +- return (info->iova & ~entry->addr_mask) == entry->iova; ++ return ((info->iova & ~entry->addr_mask) == entry->iova) || ++ ((entry->iova & ~info->mask) == info->iova); + } + +-inline void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova) ++inline void ++smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova, ++ uint8_t tg, uint64_t num_pages, uint8_t ttl) + { +- SMMUIOTLBPageInvInfo info = {.asid = asid, .iova = iova}; ++ if (ttl && (num_pages == 1)) { ++ SMMUIOTLBKey 
key = smmu_get_iotlb_key(asid, iova, tg, ttl); + +- trace_smmu_iotlb_inv_iova(asid, iova); +- g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_asid_iova, &info); ++ g_hash_table_remove(s->iotlb, &key); ++ } else { ++ /* if tg is not set we use 4KB range invalidation */ ++ uint8_t granule = tg ? tg * 2 + 10 : 12; ++ ++ SMMUIOTLBPageInvInfo info = { ++ .asid = asid, .iova = iova, ++ .mask = (num_pages * 1 << granule) - 1}; ++ ++ g_hash_table_foreach_remove(s->iotlb, ++ smmu_hash_remove_by_asid_iova, ++ &info); ++ } + } + + inline void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid) +diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h +index d190181ef1b..a4ec2c591cd 100644 +--- a/hw/arm/smmuv3-internal.h ++++ b/hw/arm/smmuv3-internal.h +@@ -298,6 +298,8 @@ enum { /* Command completion notification */ + }; + + #define CMD_TYPE(x) extract32((x)->word[0], 0 , 8) ++#define CMD_NUM(x) extract32((x)->word[0], 12 , 5) ++#define CMD_SCALE(x) extract32((x)->word[0], 20 , 5) + #define CMD_SSEC(x) extract32((x)->word[0], 10, 1) + #define CMD_SSV(x) extract32((x)->word[0], 11, 1) + #define CMD_RESUME_AC(x) extract32((x)->word[0], 12, 1) +@@ -310,6 +312,8 @@ enum { /* Command completion notification */ + #define CMD_RESUME_STAG(x) extract32((x)->word[2], 0 , 16) + #define CMD_RESP(x) extract32((x)->word[2], 11, 2) + #define CMD_LEAF(x) extract32((x)->word[2], 0 , 1) ++#define CMD_TTL(x) extract32((x)->word[2], 8 , 2) ++#define CMD_TG(x) extract32((x)->word[2], 10, 2) + #define CMD_STE_RANGE(x) extract32((x)->word[2], 0 , 5) + #define CMD_ADDR(x) ({ \ + uint64_t high = (uint64_t)(x)->word[3]; \ +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index ae2b769f891..f4d5d9d8222 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -773,42 +773,49 @@ epilogue: + * @n: notifier to be called + * @asid: address space ID or negative value if we don't care + * @iova: iova ++ * @tg: translation granule (if communicated through range invalidation) ++ * @num_pages: 
number of @granule sized pages (if tg != 0), otherwise 1 + */ + static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + IOMMUNotifier *n, +- int asid, +- dma_addr_t iova) ++ int asid, dma_addr_t iova, ++ uint8_t tg, uint64_t num_pages) + { + SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); +- SMMUEventInfo event = {.inval_ste_allowed = true}; +- SMMUTransTableInfo *tt; +- SMMUTransCfg *cfg; + IOMMUTLBEntry entry; ++ uint8_t granule = tg; + +- cfg = smmuv3_get_config(sdev, &event); +- if (!cfg) { +- return; +- } ++ if (!tg) { ++ SMMUEventInfo event = {.inval_ste_allowed = true}; ++ SMMUTransCfg *cfg = smmuv3_get_config(sdev, &event); ++ SMMUTransTableInfo *tt; + +- if (asid >= 0 && cfg->asid != asid) { +- return; +- } ++ if (!cfg) { ++ return; ++ } + +- tt = select_tt(cfg, iova); +- if (!tt) { +- return; ++ if (asid >= 0 && cfg->asid != asid) { ++ return; ++ } ++ ++ tt = select_tt(cfg, iova); ++ if (!tt) { ++ return; ++ } ++ granule = tt->granule_sz; + } + + entry.target_as = &address_space_memory; + entry.iova = iova; +- entry.addr_mask = (1 << tt->granule_sz) - 1; ++ entry.addr_mask = num_pages * (1 << granule) - 1; + entry.perm = IOMMU_NONE; + + memory_region_notify_one(n, &entry); + } + +-/* invalidate an asid/iova tuple in all mr's */ +-static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) ++/* invalidate an asid/iova range tuple in all mr's */ ++static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, ++ uint8_t tg, uint64_t num_pages) + { + SMMUDevice *sdev; + +@@ -816,28 +823,39 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) + IOMMUMemoryRegion *mr = &sdev->iommu; + IOMMUNotifier *n; + +- trace_smmuv3_inv_notifiers_iova(mr->parent_obj.name, asid, iova); ++ trace_smmuv3_inv_notifiers_iova(mr->parent_obj.name, asid, iova, ++ tg, num_pages); + + IOMMU_NOTIFIER_FOREACH(n, mr) { +- smmuv3_notify_iova(mr, n, asid, iova); ++ smmuv3_notify_iova(mr, n, asid, iova, tg, num_pages); 
+ } + } + } + + static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + { ++ uint8_t scale = 0, num = 0, ttl = 0; + dma_addr_t addr = CMD_ADDR(cmd); + uint8_t type = CMD_TYPE(cmd); + uint16_t vmid = CMD_VMID(cmd); + bool leaf = CMD_LEAF(cmd); ++ uint8_t tg = CMD_TG(cmd); ++ hwaddr num_pages = 1; + int asid = -1; + ++ if (tg) { ++ scale = CMD_SCALE(cmd); ++ num = CMD_NUM(cmd); ++ ttl = CMD_TTL(cmd); ++ num_pages = (num + 1) * (1 << (scale)); ++ } ++ + if (type == SMMU_CMD_TLBI_NH_VA) { + asid = CMD_ASID(cmd); + } +- trace_smmuv3_s1_range_inval(vmid, asid, addr, leaf); +- smmuv3_inv_notifiers_iova(s, asid, addr); +- smmu_iotlb_inv_iova(s, asid, addr); ++ trace_smmuv3_s1_range_inval(vmid, asid, addr, tg, num_pages, ttl, leaf); ++ smmuv3_inv_notifiers_iova(s, asid, addr, tg, num_pages); ++ smmu_iotlb_inv_iova(s, asid, addr, tg, num_pages, ttl); + } + + static int smmuv3_cmdq_consume(SMMUv3State *s) +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index c219fe9e828..3d905e0f7d0 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -45,11 +45,11 @@ smmuv3_cmdq_cfgi_ste_range(int start, int end) "start=0x%d - end=0x%d" + smmuv3_cmdq_cfgi_cd(uint32_t sid) "streamid = %d" + smmuv3_config_cache_hit(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache HIT for sid %d (hits=%d, misses=%d, hit rate=%d)" + smmuv3_config_cache_miss(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache MISS for sid %d (hits=%d, misses=%d, hit rate=%d)" +-smmuv3_s1_range_inval(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" leaf=%d" ++smmuv3_s1_range_inval(int vmid, int asid, uint64_t addr, uint8_t tg, uint64_t num_pages, uint8_t ttl, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" ttl=%d leaf=%d" + smmuv3_cmdq_tlbi_nh(void) "" + smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d" + smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid %d" + 
smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" +-smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint64_t iova) "iommu mr=%s asid=%d iova=0x%"PRIx64 ++smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 + +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index bbf3abc41fd..13489a1ac0d 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -168,7 +168,8 @@ SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova, + uint8_t tg, uint8_t level); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); +-void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova); ++void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova, ++ uint8_t tg, uint64_t num_pages, uint8_t ttl); + + /* Unmap the range of all the notifiers registered to any IOMMU mr */ + void smmu_inv_notifiers_all(SMMUState *s); +-- +2.27.0 + diff --git a/kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch b/kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch new file mode 100755 index 0000000..e77c403 --- /dev/null +++ b/kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch @@ -0,0 +1,115 @@ +From c4ae2dbb8ee406f0a015b35fb76b3d6d131900d6 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:31 -0500 +Subject: [PATCH 07/17] hw/arm/smmuv3: Introduce smmuv3_s1_range_inval() helper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-7-eperezma@redhat.com> +Patchwork-id: 100599 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 06/13] hw/arm/smmuv3: Introduce smmuv3_s1_range_inval() helper +Bugzilla: 1843852 +RH-Acked-by: 
Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Let's introduce an helper for S1 IOVA range invalidation. +This will be used for NH_VA and NH_VAA commands. It decodes +the same fields, trace, calls the UNMAP notifiers and +invalidate the corresponding IOTLB entries. + +At the moment, we do not support 3.2 range invalidation yet. +So it reduces to a single IOVA invalidation. + +Note the leaf bit now is also decoded for the CMD_TLBI_NH_VAA +command. At the moment it is only used for tracing. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-7-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit c0f9ef70377cfcbd0fa6559d5dc729a930d71b7c) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmuv3.c | 36 +++++++++++++++++------------------- + hw/arm/trace-events | 3 +-- + 2 files changed, 18 insertions(+), 21 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 067c9480a03..ae2b769f891 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -824,6 +824,22 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) + } + } + ++static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) ++{ ++ dma_addr_t addr = CMD_ADDR(cmd); ++ uint8_t type = CMD_TYPE(cmd); ++ uint16_t vmid = CMD_VMID(cmd); ++ bool leaf = CMD_LEAF(cmd); ++ int asid = -1; ++ ++ if (type == SMMU_CMD_TLBI_NH_VA) { ++ asid = CMD_ASID(cmd); ++ } ++ trace_smmuv3_s1_range_inval(vmid, asid, addr, leaf); ++ smmuv3_inv_notifiers_iova(s, asid, addr); ++ smmu_iotlb_inv_iova(s, asid, addr); ++} ++ + static int smmuv3_cmdq_consume(SMMUv3State *s) + { + SMMUState *bs = ARM_SMMU(s); +@@ -954,27 +970,9 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + smmu_iotlb_inv_all(bs); + break; + case SMMU_CMD_TLBI_NH_VAA: +- { +- dma_addr_t addr = CMD_ADDR(&cmd); +- uint16_t vmid = CMD_VMID(&cmd); +- +- trace_smmuv3_cmdq_tlbi_nh_vaa(vmid, addr); +- 
smmuv3_inv_notifiers_iova(bs, -1, addr); +- smmu_iotlb_inv_iova(bs, -1, addr); +- break; +- } + case SMMU_CMD_TLBI_NH_VA: +- { +- uint16_t asid = CMD_ASID(&cmd); +- uint16_t vmid = CMD_VMID(&cmd); +- dma_addr_t addr = CMD_ADDR(&cmd); +- bool leaf = CMD_LEAF(&cmd); +- +- trace_smmuv3_cmdq_tlbi_nh_va(vmid, asid, addr, leaf); +- smmuv3_inv_notifiers_iova(bs, asid, addr); +- smmu_iotlb_inv_iova(bs, asid, addr); ++ smmuv3_s1_range_inval(bs, &cmd); + break; +- } + case SMMU_CMD_TLBI_EL3_ALL: + case SMMU_CMD_TLBI_EL3_VA: + case SMMU_CMD_TLBI_EL2_ALL: +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index f74d3e920f1..c219fe9e828 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -45,8 +45,7 @@ smmuv3_cmdq_cfgi_ste_range(int start, int end) "start=0x%d - end=0x%d" + smmuv3_cmdq_cfgi_cd(uint32_t sid) "streamid = %d" + smmuv3_config_cache_hit(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache HIT for sid %d (hits=%d, misses=%d, hit rate=%d)" + smmuv3_config_cache_miss(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache MISS for sid %d (hits=%d, misses=%d, hit rate=%d)" +-smmuv3_cmdq_tlbi_nh_va(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" leaf=%d" +-smmuv3_cmdq_tlbi_nh_vaa(int vmid, uint64_t addr) "vmid =%d addr=0x%"PRIx64 ++smmuv3_s1_range_inval(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" leaf=%d" + smmuv3_cmdq_tlbi_nh(void) "" + smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d" + smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid %d" +-- +2.27.0 + diff --git a/kvm-hw-ehci-check-return-value-of-usb_packet_map.patch b/kvm-hw-ehci-check-return-value-of-usb_packet_map.patch new file mode 100755 index 0000000..3e3ed87 --- /dev/null +++ b/kvm-hw-ehci-check-return-value-of-usb_packet_map.patch @@ -0,0 +1,61 @@ +From 6955223aa15ab6ea53322218ec03fb3dc2b776f8 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 14 Jan 2021 
00:07:05 -0500 +Subject: [PATCH 16/17] hw: ehci: check return value of 'usb_packet_map' + +RH-Author: Jon Maloy +Message-id: <20210114000705.945169-2-jmaloy@redhat.com> +Patchwork-id: 100634 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] hw: ehci: check return value of 'usb_packet_map' +Bugzilla: 1898628 +RH-Acked-by: Gerd Hoffmann +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Li Qiang + +If 'usb_packet_map' fails, we should stop to process the usb +request. + +Signed-off-by: Li Qiang +Message-Id: <20200812161727.29412-1-liq3ea@163.com> +Signed-off-by: Gerd Hoffmann + +(cherry picked from commit 2fdb42d840400d58f2e706ecca82c142b97bcbd6) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/hcd-ehci.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c +index 56ab2f457f4..024b1ed6b67 100644 +--- a/hw/usb/hcd-ehci.c ++++ b/hw/usb/hcd-ehci.c +@@ -1374,7 +1374,10 @@ static int ehci_execute(EHCIPacket *p, const char *action) + spd = (p->pid == USB_TOKEN_IN && NLPTR_TBIT(p->qtd.altnext) == 0); + usb_packet_setup(&p->packet, p->pid, ep, 0, p->qtdaddr, spd, + (p->qtd.token & QTD_TOKEN_IOC) != 0); +- usb_packet_map(&p->packet, &p->sgl); ++ if (usb_packet_map(&p->packet, &p->sgl)) { ++ qemu_sglist_destroy(&p->sgl); ++ return -1; ++ } + p->async = EHCI_ASYNC_INITIALIZED; + } + +@@ -1453,7 +1456,10 @@ static int ehci_process_itd(EHCIState *ehci, + if (ep && ep->type == USB_ENDPOINT_XFER_ISOC) { + usb_packet_setup(&ehci->ipacket, pid, ep, 0, addr, false, + (itd->transact[i] & ITD_XACT_IOC) != 0); +- usb_packet_map(&ehci->ipacket, &ehci->isgl); ++ if (usb_packet_map(&ehci->ipacket, &ehci->isgl)) { ++ qemu_sglist_destroy(&ehci->isgl); ++ return -1; ++ } + usb_handle_packet(dev, &ehci->ipacket); + usb_packet_unmap(&ehci->ipacket, &ehci->isgl); + } else { +-- +2.27.0 + diff --git a/kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch 
b/kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch new file mode 100755 index 0000000..650555c --- /dev/null +++ b/kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch @@ -0,0 +1,80 @@ +From dad4f9beaa3fd1eec1e0dd46c3d5cd2f444c0f48 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 13 Apr 2021 20:05:51 -0400 +Subject: [PATCH 1/7] hw/intc/arm_gic: Fix interrupt ID in GICD_SGIR register +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210413200551.3825495-2-jmaloy@redhat.com> +Patchwork-id: 101471 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] hw/intc/arm_gic: Fix interrupt ID in GICD_SGIR register +Bugzilla: 1925430 +RH-Acked-by: Andrew Jones +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé + +From: Philippe Mathieu-Daudé + +Per the ARM Generic Interrupt Controller Architecture specification +(document "ARM IHI 0048B.b (ID072613)"), the SGIINTID field is 4 bit, +not 10: + + - 4.3 Distributor register descriptions + - 4.3.15 Software Generated Interrupt Register, GICD_SG + + - Table 4-21 GICD_SGIR bit assignments + + The Interrupt ID of the SGI to forward to the specified CPU + interfaces. The value of this field is the Interrupt ID, in + the range 0-15, for example a value of 0b0011 specifies + Interrupt ID 3. + +Correct the irq mask to fix an undefined behavior (which eventually +lead to a heap-buffer-overflow, see [Buglink]): + + $ echo 'writel 0x8000f00 0xff4affb0' | qemu-system-aarch64 -M virt,accel=qtest -qtest stdio + [I 1612088147.116987] OPENED + [R +0.278293] writel 0x8000f00 0xff4affb0 + ../hw/intc/arm_gic.c:1498:13: runtime error: index 944 out of bounds for type 'uint8_t [16][8]' + SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior ../hw/intc/arm_gic.c:1498:13 + +This fixes a security issue when running with KVM on Arm with +kernel-irqchip=off. 
(The default is kernel-irqchip=on, which is +unaffected, and which is also the correct choice for performance.) + +Cc: qemu-stable@nongnu.org +Fixes: CVE-2021-20221 +Fixes: 9ee6e8bb853 ("ARMv7 support.") +Buglink: https://bugs.launchpad.net/qemu/+bug/1913916 +Buglink: https://bugs.launchpad.net/qemu/+bug/1913917 +Reported-by: Alexander Bulekov +Signed-off-by: Philippe Mathieu-Daudé +Message-id: 20210131103401.217160-1-f4bug@amsat.org +Reviewed-by: Peter Maydell +Signed-off-by: Peter Maydell + +(cherry picked from commit edfe2eb4360cde4ed5d95bda7777edcb3510f76a) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/intc/arm_gic.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c +index 1d7da7baa2..df355f4d11 100644 +--- a/hw/intc/arm_gic.c ++++ b/hw/intc/arm_gic.c +@@ -1455,7 +1455,7 @@ static void gic_dist_writel(void *opaque, hwaddr offset, + int target_cpu; + + cpu = gic_get_current_cpu(s); +- irq = value & 0x3ff; ++ irq = value & 0xf; + switch ((value >> 24) & 3) { + case 0: + mask = (value >> 16) & ALL_CPU_MASK; +-- +2.27.0 + diff --git a/kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch b/kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch new file mode 100755 index 0000000..cf9f6ab --- /dev/null +++ b/kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch @@ -0,0 +1,62 @@ +From d48034cc2b331313995c1d19060decc0e5ca1356 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 14 Jan 2021 01:35:41 -0500 +Subject: [PATCH 17/17] hw/net/e1000e: advance desc_offset in case of null + descriptor +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210114013541.956735-2-jmaloy@redhat.com> +Patchwork-id: 100638 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] hw/net/e1000e: advance desc_offset in case of null descriptor +Bugzilla: 1903070 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Philippe 
Mathieu-Daudé +RH-Acked-by: Thomas Huth + +From: Prasad J Pandit + +While receiving packets via e1000e_write_packet_to_guest() routine, +'desc_offset' is advanced only when RX descriptor is processed. And +RX descriptor is not processed if it has NULL buffer address. +This may lead to an infinite loop condition. Increment 'desc_offset' +to process next descriptor in the ring to avoid infinite loop. + +Reported-by: Cheol-woo Myung <330cjfdn@gmail.com> +Signed-off-by: Prasad J Pandit +Signed-off-by: Jason Wang + +(cherry picked from c2cb511634012344e3d0fe49a037a33b12d8a98a) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/e1000e_core.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c +index 9b76f82db5b..166054f2e3f 100644 +--- a/hw/net/e1000e_core.c ++++ b/hw/net/e1000e_core.c +@@ -1596,13 +1596,13 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, + (const char *) &fcs_pad, e1000x_fcs_len(core->mac)); + } + } +- desc_offset += desc_size; +- if (desc_offset >= total_size) { +- is_last = true; +- } + } else { /* as per intel docs; skip descriptors with null buf addr */ + trace_e1000e_rx_null_descriptor(); + } ++ desc_offset += desc_size; ++ if (desc_offset >= total_size) { ++ is_last = true; ++ } + + e1000e_write_rx_descr(core, desc, is_last ? core->rx_pkt : NULL, + rss_info, do_ps ? 
ps_hdr_len : 0, &bastate.written); +-- +2.27.0 + diff --git a/kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch b/kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch new file mode 100755 index 0000000..228bdff --- /dev/null +++ b/kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch @@ -0,0 +1,56 @@ +From 94ca0eddc117b57da009dacb19740fc8ae00143a Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Mon, 28 Sep 2020 18:27:35 -0400 +Subject: [PATCH] hw/net/net_tx_pkt: fix assertion failure in + net_tx_pkt_add_raw_fragment() + +RH-Author: Jon Maloy +Message-id: <20200928182735.1008839-2-jmaloy@redhat.com> +Patchwork-id: 98497 +O-Subject: [RHEL-8.0.0 qemu-kvm PATCH 1/1] hw/net/net_tx_pkt: fix assertion failure in net_tx_pkt_add_raw_fragment() +Bugzilla: 1860994 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Xiao Wang +RH-Acked-by: Thomas Huth +RH-Acked-by: Stefan Hajnoczi + +From: Mauro Matteo Cascella + +An assertion failure issue was found in the code that processes network packets +while adding data fragments into the packet context. It could be abused by a +malicious guest to abort the QEMU process on the host. This patch replaces the +affected assert() with a conditional statement, returning false if the current +data fragment exceeds max_raw_frags. + +Reported-by: Alexander Bulekov +Reported-by: Ziming Zhang +Reviewed-by: Dmitry Fleytman +Signed-off-by: Mauro Matteo Cascella +Signed-off-by: Jason Wang + +(cherry picked from commit 035e69b063835a5fd23cacabd63690a3d84532a8) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/net_tx_pkt.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c +index 162f802dd77..54d4c3bbd02 100644 +--- a/hw/net/net_tx_pkt.c ++++ b/hw/net/net_tx_pkt.c +@@ -379,7 +379,10 @@ bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, hwaddr pa, + hwaddr mapped_len = 0; + struct iovec *ventry; + assert(pkt); +- assert(pkt->max_raw_frags > pkt->raw_frags); ++ ++ if (pkt->raw_frags >= pkt->max_raw_frags) { ++ return false; ++ } + + if (!len) { + return true; +-- +2.27.0 + diff --git a/kvm-hw-pci-pcie-Forbid-hot-plug-if-it-s-disabled-on-the-.patch b/kvm-hw-pci-pcie-Forbid-hot-plug-if-it-s-disabled-on-the-.patch new file mode 100755 index 0000000..2f4f6dd --- /dev/null +++ b/kvm-hw-pci-pcie-Forbid-hot-plug-if-it-s-disabled-on-the-.patch @@ -0,0 +1,77 @@ +From fe8a9f211fba3588d60507b3d2f48c41d8ee3c79 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Mon, 4 May 2020 21:25:04 +0100 +Subject: [PATCH 1/9] hw/pci/pcie: Forbid hot-plug if it's disabled on the slot + +RH-Author: Julia Suvorova +Message-id: <20200504212505.15977-2-jusual@redhat.com> +Patchwork-id: 96257 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] hw/pci/pcie: Forbid hot-plug if it's disabled on the slot +Bugzilla: 1820531 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Peter Xu + +Raise an error when trying to hot-plug/unplug a device through QMP to a device +with disabled hot-plug capability. This makes the device behaviour more +consistent and provides an explanation of the failure in the case of +asynchronous unplug. + +Signed-off-by: Julia Suvorova +Message-Id: <20200427182440.92433-2-jusual@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Marcel Apfelbaum +(cherry picked from commit 0501e1aa1d32a6e02dd06a79bba97fbe9d557cb5) +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/pci/pcie.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c +index 0eb3a2a..6b48d04 100644 +--- a/hw/pci/pcie.c ++++ b/hw/pci/pcie.c +@@ -415,6 +415,7 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + { + PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); + uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; ++ uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); + PCIDevice *pci_dev = PCI_DEVICE(dev); + + /* Don't send event when device is enabled during qemu machine creation: +@@ -430,6 +431,13 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + return; + } + ++ /* Check if hot-plug is disabled on the slot */ ++ if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { ++ error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'", ++ DEVICE(hotplug_pdev)->id); ++ return; ++ } ++ + /* To enable multifunction hot-plug, we just ensure the function + * 0 added last. When function 0 is added, we set the sltsta and + * inform OS via event notification. 
+@@ -470,6 +478,17 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, + Error *local_err = NULL; + PCIDevice *pci_dev = PCI_DEVICE(dev); + PCIBus *bus = pci_get_bus(pci_dev); ++ PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); ++ uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; ++ uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); ++ ++ /* Check if hot-unplug is disabled on the slot */ ++ if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { ++ error_setg(errp, "Hot-unplug failed: " ++ "unsupported by the port device '%s'", ++ DEVICE(hotplug_pdev)->id); ++ return; ++ } + + pcie_cap_slot_plug_common(PCI_DEVICE(hotplug_dev), dev, &local_err); + if (local_err) { +-- +1.8.3.1 + diff --git a/kvm-hw-pci-pcie-Move-hot-plug-capability-check-to-pre_pl.patch b/kvm-hw-pci-pcie-Move-hot-plug-capability-check-to-pre_pl.patch new file mode 100755 index 0000000..0c44c77 --- /dev/null +++ b/kvm-hw-pci-pcie-Move-hot-plug-capability-check-to-pre_pl.patch @@ -0,0 +1,90 @@ +From 035f8aaabf2c31cd6206bff6da23a12fee69d1b7 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Tue, 16 Jun 2020 12:25:36 -0400 +Subject: [PATCH 1/3] hw/pci/pcie: Move hot plug capability check to pre_plug + callback + +RH-Author: Julia Suvorova +Message-id: <20200616122536.1027685-1-jusual@redhat.com> +Patchwork-id: 97548 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] hw/pci/pcie: Move hot plug capability check to pre_plug callback +Bugzilla: 1820531 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Auger Eric +RH-Acked-by: Sergio Lopez Pascual + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1820531 +BRANCH: rhel-av-8.2.1 +UPSTREAM: merged +BREW: 29422092 + +Check for hot plug capability earlier to avoid removing devices attached +during the initialization process. 
+ +Run qemu with an unattached drive: + -drive file=$FILE,if=none,id=drive0 \ + -device pcie-root-port,id=rp0,slot=3,bus=pcie.0,hotplug=off +Hotplug a block device: + device_add virtio-blk-pci,id=blk0,drive=drive0,bus=rp0 +If hotplug fails on plug_cb, drive0 will be deleted. + +Fixes: 0501e1aa1d32a6 ("hw/pci/pcie: Forbid hot-plug if it's disabled on the slot") + +Acked-by: Igor Mammedov +Signed-off-by: Julia Suvorova +Message-Id: <20200604125947.881210-1-jusual@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 0dabc0f6544f2c0310546f6d6cf3b68979580a9c) +Signed-off-by: Eduardo Lima (Etrunko) +--- + hw/pci/pcie.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c +index abc99b6eff..1386dd228c 100644 +--- a/hw/pci/pcie.c ++++ b/hw/pci/pcie.c +@@ -407,6 +407,17 @@ static void pcie_cap_slot_plug_common(PCIDevice *hotplug_dev, DeviceState *dev, + void pcie_cap_slot_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) + { ++ PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); ++ uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; ++ uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); ++ ++ /* Check if hot-plug is disabled on the slot */ ++ if (dev->hotplugged && (sltcap & PCI_EXP_SLTCAP_HPC) == 0) { ++ error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'", ++ DEVICE(hotplug_pdev)->id); ++ return; ++ } ++ + pcie_cap_slot_plug_common(PCI_DEVICE(hotplug_dev), dev, errp); + } + +@@ -415,7 +426,6 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + { + PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); + uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; +- uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); + PCIDevice *pci_dev = PCI_DEVICE(dev); + + /* Don't send event when device is enabled during qemu machine creation: +@@ -431,13 +441,6 
@@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + return; + } + +- /* Check if hot-plug is disabled on the slot */ +- if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { +- error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'", +- DEVICE(hotplug_pdev)->id); +- return; +- } +- + /* To enable multifunction hot-plug, we just ensure the function + * 0 added last. When function 0 is added, we set the sltsta and + * inform OS via event notification. +-- +2.27.0 + diff --git a/kvm-hw-pci-pcie-Replace-PCI_DEVICE-casts-with-existing-v.patch b/kvm-hw-pci-pcie-Replace-PCI_DEVICE-casts-with-existing-v.patch new file mode 100755 index 0000000..51a587f --- /dev/null +++ b/kvm-hw-pci-pcie-Replace-PCI_DEVICE-casts-with-existing-v.patch @@ -0,0 +1,62 @@ +From f98a1fdad0aa53337925ac46b73a3e6ad36f6295 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Mon, 4 May 2020 21:25:05 +0100 +Subject: [PATCH 2/9] hw/pci/pcie: Replace PCI_DEVICE() casts with existing + variable + +RH-Author: Julia Suvorova +Message-id: <20200504212505.15977-3-jusual@redhat.com> +Patchwork-id: 96259 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] hw/pci/pcie: Replace PCI_DEVICE() casts with existing variable +Bugzilla: 1820531 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Peter Xu + +A little cleanup is possible because of hotplug_pdev introduction. + +Signed-off-by: Julia Suvorova +Message-Id: <20200427182440.92433-3-jusual@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Marcel Apfelbaum +(cherry picked from commit 6a1e073378353eb6ac0565e0dc649b3db76ed5dc) +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/pci/pcie.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c +index 6b48d04..abc99b6 100644 +--- a/hw/pci/pcie.c ++++ b/hw/pci/pcie.c +@@ -449,7 +449,7 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + pci_word_test_and_set_mask(exp_cap + PCI_EXP_LNKSTA, + PCI_EXP_LNKSTA_DLLLA); + } +- pcie_cap_slot_event(PCI_DEVICE(hotplug_dev), ++ pcie_cap_slot_event(hotplug_pdev, + PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); + } + } +@@ -490,7 +490,7 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, + return; + } + +- pcie_cap_slot_plug_common(PCI_DEVICE(hotplug_dev), dev, &local_err); ++ pcie_cap_slot_plug_common(hotplug_pdev, dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; +@@ -509,7 +509,7 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, + return; + } + +- pcie_cap_slot_push_attention_button(PCI_DEVICE(hotplug_dev)); ++ pcie_cap_slot_push_attention_button(hotplug_pdev); + } + + /* pci express slot for pci express root/downstream port +-- +1.8.3.1 + diff --git a/kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch b/kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch new file mode 100755 index 0000000..9ea4c38 --- /dev/null +++ b/kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch @@ -0,0 +1,61 @@ +From 60b05771b8afc08e0ca9956658d2c55cd1739652 Mon Sep 17 00:00:00 2001 +From: Mauro Matteo Cascella +Date: Thu, 4 Nov 2021 17:31:38 +0100 +Subject: [PATCH 1/2] hw/scsi/scsi-disk: MODE_PAGE_ALLS not allowed in MODE + SELECT commands +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +RH-MergeRequest: 70: hw/scsi/scsi-disk: MODE_PAGE_ALLS not allowed in MODE SELECT commands +RH-Commit: [1/1] bd3de8bdf48aa6c522612505d08c23dafb122a34 (jmaloy/qemu-kvm) +RH-Bugzilla: 2025605 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: 
Laurent Vivier +RH-Acked-by: Philippe Mathieu-Daudé + +This avoids an off-by-one read of 'mode_sense_valid' buffer in +hw/scsi/scsi-disk.c:mode_sense_page(). + +Fixes: CVE-2021-3930 +Cc: qemu-stable@nongnu.org +Reported-by: Alexander Bulekov +Fixes: a8f4bbe2900 ("scsi-disk: store valid mode pages in a table") +Fixes: #546 +Reported-by: Qiuhao Li +Signed-off-by: Mauro Matteo Cascella +Signed-off-by: Paolo Bonzini +(cherry picked from commit b3af7fdf9cc537f8f0dd3e2423d83f5c99a457e8) +Signed-off-by: Jon Maloy +--- + hw/scsi/scsi-disk.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c +index 5cb5fd35bd..1d0ea72289 100644 +--- a/hw/scsi/scsi-disk.c ++++ b/hw/scsi/scsi-disk.c +@@ -1086,6 +1086,7 @@ static int mode_sense_page(SCSIDiskState *s, int page, uint8_t **p_outbuf, + uint8_t *p = *p_outbuf + 2; + int length; + ++ assert(page < ARRAY_SIZE(mode_sense_valid)); + if ((mode_sense_valid[page] & (1 << s->qdev.type)) == 0) { + return -1; + } +@@ -1427,6 +1428,11 @@ static int scsi_disk_check_mode_select(SCSIDiskState *s, int page, + return -1; + } + ++ /* MODE_PAGE_ALLS is only valid for MODE SENSE commands */ ++ if (page == MODE_PAGE_ALLS) { ++ return -1; ++ } ++ + p = mode_current; + memset(mode_current, 0, inlen + 2); + len = mode_sense_page(s, page, &p, 0); +-- +2.27.0 + diff --git a/kvm-hw-smbios-set-new-default-SMBIOS-fields-for-Windows-.patch b/kvm-hw-smbios-set-new-default-SMBIOS-fields-for-Windows-.patch new file mode 100755 index 0000000..0f0f126 --- /dev/null +++ b/kvm-hw-smbios-set-new-default-SMBIOS-fields-for-Windows-.patch @@ -0,0 +1,262 @@ +From e6c3fbfc82863180007569cf2a9132c28a47bf1f Mon Sep 17 00:00:00 2001 +From: "Daniel P. Berrange" +Date: Mon, 20 Jan 2020 16:13:08 +0000 +Subject: [PATCH 01/18] hw/smbios: set new default SMBIOS fields for Windows + driver support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Daniel P. 
Berrange +Message-id: <20200120161308.584989-2-berrange@redhat.com> +Patchwork-id: 93422 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] hw/smbios: set new default SMBIOS fields for Windows driver support +Bugzilla: 1782529 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Igor Mammedov +RH-Acked-by: Laszlo Ersek + +For Windows driver support, we have to follow this doc in order to +enable Windows to automatically determine the right drivers to install +for a given guest / host combination: + + https://docs.microsoft.com/en-us/windows-hardware/drivers/install/specifying-hardware-ids-for-a-computer + +Out of the choices available, it was decided that the Windows drivers +will be written to expect use of the scheme documented as "HardwareID-6" +against Windows 10. This uses SMBIOS System (Type 1) and Base Board +(Type 2) tables and will match on + + System Manufacturer = Red Hat + System SKU Number = 8.2.0 + Baseboard Manufacturer = Red Hat + Baseboard Product = RHEL-AV + +The new SMBIOS fields will be tied to machine type and only reported for +pc-q35-8.2.0 machine and later. + +The old SMBIOS fields, previously reported by all machines were: + + System Manufacturer: Red Hat + System Product Name: KVM + System Version: RHEL-8.2.0 PC (Q35 + ICH9, 2009) + System Family: Red Hat Enterprise Linux + Baseboard Manufacturer: Red Hat + Baseboard Product Name: KVM + Baseboard Version: RHEL-8.2.0 PC (Q35 + ICH9, 2009) + Chassis Manufacturer: Red Hat + Chassis Product Name: KVM + Chassis Version: RHEL-8.2.0 PC (Q35 + ICH9, 2009) + Processor Manufacturer: Red Hat + Processor Product Name: KVM + Processor Version: RHEL-8.2.0 PC (Q35 + ICH9, 2009) + +This information will continue to be reported for all machines, except +where it conflicts with the requirement of the new SMBIOS data. IOW, +the "Baseboard Product Name" will change to "RHEL-AV" for pc-q35-8.2.0 +machine types and later. 
+ +Management applications MUST NEVER override the 4 new SMBIOS fields that +are used for Windows driver matching, with differing values. Aside from +this, they are free to override any other field, including those from +the old SMBIOS field data. + +In particular if a management application wants to report its own +product name and version, it is recommended to use "System product" +and "System version" as identifying fields, as these avoid a clash with +the new SMBIOS fields used for Windows drivers. + +Note that until now the Baseboard (type 2) table has only been generated +by QEMU if explicitly asked for on the CLI. This patch makes it always +present for new machine types. + +Signed-off-by: Daniel P. Berrangé +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/virt.c | 2 +- + hw/i386/pc_piix.c | 2 ++ + hw/i386/pc_q35.c | 8 ++++++++ + hw/smbios/smbios.c | 45 +++++++++++++++++++++++++++++++++++++++++--- + include/hw/firmware/smbios.h | 5 ++++- + include/hw/i386/pc.h | 3 +++ + 6 files changed, 60 insertions(+), 5 deletions(-) + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index d30d38c..2dcf6e7 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -1423,7 +1423,7 @@ static void virt_build_smbios(VirtMachineState *vms) + + smbios_set_defaults("QEMU", product, + vmc->smbios_old_sys_ver ? 
"1.0" : mc->name, false, +- true, SMBIOS_ENTRY_POINT_30); ++ true, NULL, NULL, SMBIOS_ENTRY_POINT_30); + + smbios_get_tables(MACHINE(vms), NULL, 0, &smbios_tables, &smbios_tables_len, + &smbios_anchor, &smbios_anchor_len); +diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c +index bd7fdb9..2ac94d5 100644 +--- a/hw/i386/pc_piix.c ++++ b/hw/i386/pc_piix.c +@@ -177,6 +177,8 @@ static void pc_init1(MachineState *machine, + smbios_set_defaults("Red Hat", "KVM", + mc->desc, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, ++ pcmc->smbios_stream_product, ++ pcmc->smbios_stream_version, + SMBIOS_ENTRY_POINT_21); + } + +diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c +index 7531d8e..e975643 100644 +--- a/hw/i386/pc_q35.c ++++ b/hw/i386/pc_q35.c +@@ -200,6 +200,8 @@ static void pc_q35_init(MachineState *machine) + smbios_set_defaults("Red Hat", "KVM", + mc->desc, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, ++ pcmc->smbios_stream_product, ++ pcmc->smbios_stream_version, + SMBIOS_ENTRY_POINT_21); + } + +@@ -565,8 +567,11 @@ static void pc_q35_init_rhel820(MachineState *machine) + + static void pc_q35_machine_rhel820_options(MachineClass *m) + { ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_q35_machine_rhel_options(m); + m->desc = "RHEL-8.2.0 PC (Q35 + ICH9, 2009)"; ++ pcmc->smbios_stream_product = "RHEL-AV"; ++ pcmc->smbios_stream_version = "8.2.0"; + } + + DEFINE_PC_MACHINE(q35_rhel820, "pc-q35-rhel8.2.0", pc_q35_init_rhel820, +@@ -579,9 +584,12 @@ static void pc_q35_init_rhel810(MachineState *machine) + + static void pc_q35_machine_rhel810_options(MachineClass *m) + { ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_q35_machine_rhel820_options(m); + m->desc = "RHEL-8.1.0 PC (Q35 + ICH9, 2009)"; + m->alias = NULL; ++ pcmc->smbios_stream_product = NULL; ++ pcmc->smbios_stream_version = NULL; + compat_props_add(m->compat_props, hw_compat_rhel_8_1, hw_compat_rhel_8_1_len); + compat_props_add(m->compat_props, pc_rhel_8_1_compat, pc_rhel_8_1_compat_len); 
+ } +diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c +index e6e9355..d65c149 100644 +--- a/hw/smbios/smbios.c ++++ b/hw/smbios/smbios.c +@@ -57,6 +57,9 @@ static bool smbios_legacy = true; + static bool smbios_uuid_encoded = true; + /* end: legacy structures & constants for <= 2.0 machines */ + ++/* Set to true for modern Windows 10 HardwareID-6 compat */ ++static bool smbios_type2_required; ++ + + uint8_t *smbios_tables; + size_t smbios_tables_len; +@@ -532,7 +535,7 @@ static void smbios_build_type_1_table(void) + + static void smbios_build_type_2_table(void) + { +- SMBIOS_BUILD_TABLE_PRE(2, 0x200, false); /* optional */ ++ SMBIOS_BUILD_TABLE_PRE(2, 0x200, smbios_type2_required); + + SMBIOS_TABLE_SET_STR(2, manufacturer_str, type2.manufacturer); + SMBIOS_TABLE_SET_STR(2, product_str, type2.product); +@@ -753,7 +756,10 @@ void smbios_set_cpuid(uint32_t version, uint32_t features) + + void smbios_set_defaults(const char *manufacturer, const char *product, + const char *version, bool legacy_mode, +- bool uuid_encoded, SmbiosEntryPointType ep_type) ++ bool uuid_encoded, ++ const char *stream_product, ++ const char *stream_version, ++ SmbiosEntryPointType ep_type) + { + smbios_have_defaults = true; + smbios_legacy = legacy_mode; +@@ -774,12 +780,45 @@ void smbios_set_defaults(const char *manufacturer, const char *product, + g_free(smbios_entries); + } + ++ /* ++ * If @stream_product & @stream_version are non-NULL, then ++ * we're following rules for new Windows driver support. 
++ * The data we have to report is defined in this doc: ++ * ++ * https://docs.microsoft.com/en-us/windows-hardware/drivers/install/specifying-hardware-ids-for-a-computer ++ * ++ * The Windows drivers are written to expect use of the ++ * scheme documented as "HardwareID-6" against Windows 10, ++ * which uses SMBIOS System (Type 1) and Base Board (Type 2) ++ * tables and will match on ++ * ++ * System Manufacturer = Red Hat (@manufacturer) ++ * System SKU Number = 8.2.0 (@stream_version) ++ * Baseboard Manufacturer = Red Hat (@manufacturer) ++ * Baseboard Product = RHEL-AV (@stream_product) ++ * ++ * NB, SKU must be changed with each RHEL-AV release ++ * ++ * Other fields can be freely used by applications using ++ * QEMU. For example apps can use the "System product" ++ * and "System version" to identify themselves. ++ * ++ * We get 'System Manufacturer' and 'Baseboard Manufacturer' ++ */ + SMBIOS_SET_DEFAULT(type1.manufacturer, manufacturer); + SMBIOS_SET_DEFAULT(type1.product, product); + SMBIOS_SET_DEFAULT(type1.version, version); + SMBIOS_SET_DEFAULT(type1.family, "Red Hat Enterprise Linux"); ++ if (stream_version != NULL) { ++ SMBIOS_SET_DEFAULT(type1.sku, stream_version); ++ } + SMBIOS_SET_DEFAULT(type2.manufacturer, manufacturer); +- SMBIOS_SET_DEFAULT(type2.product, product); ++ if (stream_product != NULL) { ++ SMBIOS_SET_DEFAULT(type2.product, stream_product); ++ smbios_type2_required = true; ++ } else { ++ SMBIOS_SET_DEFAULT(type2.product, product); ++ } + SMBIOS_SET_DEFAULT(type2.version, version); + SMBIOS_SET_DEFAULT(type3.manufacturer, manufacturer); + SMBIOS_SET_DEFAULT(type3.version, version); +diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h +index 02a0ced..67e38a1 100644 +--- a/include/hw/firmware/smbios.h ++++ b/include/hw/firmware/smbios.h +@@ -267,7 +267,10 @@ void smbios_entry_add(QemuOpts *opts, Error **errp); + void smbios_set_cpuid(uint32_t version, uint32_t features); + void smbios_set_defaults(const char 
*manufacturer, const char *product, + const char *version, bool legacy_mode, +- bool uuid_encoded, SmbiosEntryPointType ep_type); ++ bool uuid_encoded, ++ const char *stream_product, ++ const char *stream_version, ++ SmbiosEntryPointType ep_type); + uint8_t *smbios_get_table_legacy(MachineState *ms, size_t *length); + void smbios_get_tables(MachineState *ms, + const struct smbios_phys_mem_area *mem_array, +diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h +index 2e362c8..b9f29ba 100644 +--- a/include/hw/i386/pc.h ++++ b/include/hw/i386/pc.h +@@ -109,6 +109,9 @@ typedef struct PCMachineClass { + bool smbios_defaults; + bool smbios_legacy_mode; + bool smbios_uuid_encoded; ++ /* New fields needed for Windows HardwareID-6 matching */ ++ const char *smbios_stream_product; ++ const char *smbios_stream_version; + + /* RAM / address space compat: */ + bool gigabyte_align; +-- +1.8.3.1 + diff --git a/kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch b/kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch new file mode 100755 index 0000000..b2cc438 --- /dev/null +++ b/kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch @@ -0,0 +1,199 @@ +From 1bee5a77b3f999d2933a440021737d0720b32269 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Wed, 29 Jul 2020 18:56:21 -0400 +Subject: [PATCH 1/4] i386: Add 2nd Generation AMD EPYC processors + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200729185621.152427-2-dgilbert@redhat.com> +Patchwork-id: 98078 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] i386: Add 2nd Generation AMD EPYC processors +Bugzilla: 1780385 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Maxim Levitsky + +From: "Moger, Babu" + +Adds the support for 2nd Gen AMD EPYC Processors. The model display +name will be EPYC-Rome. + +Adds the following new feature bits on top of the feature bits from the +first generation EPYC models. +perfctr-core : core performance counter extensions support. 
Enables the VM to + use extended performance counter support. It enables six + programmable counters instead of four counters. +clzero : instruction zeroes out the 64 byte cache line specified in RAX. +xsaveerptr : FXSAVE, XSAVE, FXSAVEOPT, XSAVEC, XSAVES always save error + pointers and FXRSTOR, XRSTOR, XRSTORS always restore error + pointers. +wbnoinvd : Write back and do not invalidate cache +ibpb : Indirect Branch Prediction Barrier +amd-stibp : Single Thread Indirect Branch Predictor +clwb : Cache Line Write Back and Retain +xsaves : XSAVES, XRSTORS and IA32_XSS support +rdpid : Read Processor ID instruction support +umip : User-Mode Instruction Prevention support + +The Reference documents are available at +https://developer.amd.com/wp-content/resources/55803_0.54-PUB.pdf +https://www.amd.com/system/files/TechDocs/24594.pdf + +Depends on following kernel commits: +40bc47b08b6e ("kvm: x86: Enumerate support for CLZERO instruction") +504ce1954fba ("KVM: x86: Expose XSAVEERPTR to the guest") +6d61e3c32248 ("kvm: x86: Expose RDPID in KVM_GET_SUPPORTED_CPUID") +52297436199d ("kvm: svm: Update svm_xsaves_supported") + +Signed-off-by: Babu Moger +Message-Id: <157314966312.23828.17684821666338093910.stgit@naples-babu.amd.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 143c30d4d346831a09e59e9af45afdca0331e819) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/cpu.c | 102 +++++++++++++++++++++++++++++++++++++++++++++- + target/i386/cpu.h | 2 + + 2 files changed, 103 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index a343de0c9d..ff39fc9905 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1133,7 +1133,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + "clzero", NULL, "xsaveerptr", NULL, + NULL, NULL, NULL, NULL, + NULL, "wbnoinvd", NULL, NULL, +- "ibpb", NULL, NULL, NULL, ++ "ibpb", NULL, NULL, "amd-stibp", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "amd-ssbd", "virt-ssbd", "amd-no-ssb", NULL, +@@ -1803,6 +1803,56 @@ static CPUCaches epyc_cache_info = { + }, + }; + ++static CPUCaches epyc_rome_cache_info = { ++ .l1d_cache = &(CPUCacheInfo) { ++ .type = DATA_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = 1, ++ .no_invd_sharing = true, ++ }, ++ .l1i_cache = &(CPUCacheInfo) { ++ .type = INSTRUCTION_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = 1, ++ .no_invd_sharing = true, ++ }, ++ .l2_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 2, ++ .size = 512 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 1024, ++ .lines_per_tag = 1, ++ }, ++ .l3_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 3, ++ .size = 16 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 16384, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .inclusive = true, ++ .complex_indexing = true, ++ }, ++}; ++ + /* The following VMX features are not supported by KVM and are left out in the + * CPU definitions: + * +@@ -4024,6 +4074,56 @@ static X86CPUDefinition builtin_x86_defs[] = { + .model_id = "Hygon Dhyana Processor", + .cache_info = 
&epyc_cache_info, + }, ++ { ++ .name = "EPYC-Rome", ++ .level = 0xd, ++ .vendor = CPUID_VENDOR_AMD, ++ .family = 23, ++ .model = 49, ++ .stepping = 0, ++ .features[FEAT_1_EDX] = ++ CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | CPUID_CLFLUSH | ++ CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | CPUID_PGE | ++ CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | CPUID_MCE | ++ CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | CPUID_DE | ++ CPUID_VME | CPUID_FP87, ++ .features[FEAT_1_ECX] = ++ CPUID_EXT_RDRAND | CPUID_EXT_F16C | CPUID_EXT_AVX | ++ CPUID_EXT_XSAVE | CPUID_EXT_AES | CPUID_EXT_POPCNT | ++ CPUID_EXT_MOVBE | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | ++ CPUID_EXT_CX16 | CPUID_EXT_FMA | CPUID_EXT_SSSE3 | ++ CPUID_EXT_MONITOR | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3, ++ .features[FEAT_8000_0001_EDX] = ++ CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_PDPE1GB | ++ CPUID_EXT2_FFXSR | CPUID_EXT2_MMXEXT | CPUID_EXT2_NX | ++ CPUID_EXT2_SYSCALL, ++ .features[FEAT_8000_0001_ECX] = ++ CPUID_EXT3_OSVW | CPUID_EXT3_3DNOWPREFETCH | ++ CPUID_EXT3_MISALIGNSSE | CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | ++ CPUID_EXT3_CR8LEG | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM | ++ CPUID_EXT3_TOPOEXT | CPUID_EXT3_PERFCORE, ++ .features[FEAT_8000_0008_EBX] = ++ CPUID_8000_0008_EBX_CLZERO | CPUID_8000_0008_EBX_XSAVEERPTR | ++ CPUID_8000_0008_EBX_WBNOINVD | CPUID_8000_0008_EBX_IBPB | ++ CPUID_8000_0008_EBX_STIBP, ++ .features[FEAT_7_0_EBX] = ++ CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | ++ CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_RDSEED | ++ CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | ++ CPUID_7_0_EBX_SHA_NI | CPUID_7_0_EBX_CLWB, ++ .features[FEAT_7_0_ECX] = ++ CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_RDPID, ++ .features[FEAT_XSAVE] = ++ CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | ++ CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, ++ .features[FEAT_6_EAX] = ++ CPUID_6_EAX_ARAT, ++ .features[FEAT_SVM] = ++ CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE, ++ .xlevel = 
0x8000001E, ++ .model_id = "AMD EPYC-Rome Processor", ++ .cache_info = &epyc_rome_cache_info, ++ }, + }; + + /* KVM-specific features that are automatically added/removed +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 7bfbf2a5e5..f3da25cb8a 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -792,6 +792,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_8000_0008_EBX_WBNOINVD (1U << 9) + /* Indirect Branch Prediction Barrier */ + #define CPUID_8000_0008_EBX_IBPB (1U << 12) ++/* Single Thread Indirect Branch Predictors */ ++#define CPUID_8000_0008_EBX_STIBP (1U << 15) + + #define CPUID_XSAVE_XSAVEOPT (1U << 0) + #define CPUID_XSAVE_XSAVEC (1U << 1) +-- +2.27.0 + diff --git a/kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch b/kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch new file mode 100755 index 0000000..823ff0c --- /dev/null +++ b/kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch @@ -0,0 +1,46 @@ +From cdafcc1d68110ed172c09c9e6bba42ee15b5a6df Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 15 May 2020 18:02:40 +0100 +Subject: [PATCH 13/17] i386: Add MSR feature bit for MDS-NO + +RH-Author: plai@redhat.com +Message-id: <20200515180243.17488-2-plai@redhat.com> +Patchwork-id: 96609 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 1/4] i386: Add MSR feature bit for MDS-NO +Bugzilla: 1769912 +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Dr. David Alan Gilbert + +From: Cathy Zhang + +Define MSR_ARCH_CAP_MDS_NO in the IA32_ARCH_CAPABILITIES MSR to allow +CPU models to report the feature when host supports it. + +Signed-off-by: Cathy Zhang +Reviewed-by: Xiaoyao Li +Reviewed-by: Tao Xu +Message-Id: <1571729728-23284-2-git-send-email-cathy.zhang@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 77b168d221191156c47fcd8d1c47329dfdb9439e) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/cpu.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 4441061..60304cc 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -839,6 +839,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define MSR_ARCH_CAP_RSBA (1U << 2) + #define MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY (1U << 3) + #define MSR_ARCH_CAP_SSB_NO (1U << 4) ++#define MSR_ARCH_CAP_MDS_NO (1U << 5) + + #define MSR_CORE_CAP_SPLIT_LOCK_DETECT (1U << 5) + +-- +1.8.3.1 + diff --git a/kvm-i386-Add-macro-for-stibp.patch b/kvm-i386-Add-macro-for-stibp.patch new file mode 100755 index 0000000..17dd149 --- /dev/null +++ b/kvm-i386-Add-macro-for-stibp.patch @@ -0,0 +1,49 @@ +From 00f916987589f114f42ce20b138c00c47b9e4df7 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 15 May 2020 18:02:41 +0100 +Subject: [PATCH 14/17] i386: Add macro for stibp + +RH-Author: plai@redhat.com +Message-id: <20200515180243.17488-3-plai@redhat.com> +Patchwork-id: 96610 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 2/4] i386: Add macro for stibp +Bugzilla: 1769912 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Cathy Zhang + +stibp feature is already added through the following commit. +https://github.com/qemu/qemu/commit/0e8916582991b9fd0b94850a8444b8b80d0a0955 + +Add a macro for it to allow CPU models to report it when host supports. + +Signed-off-by: Cathy Zhang +Reviewed-by: Xiaoyao Li +Reviewed-by: Tao Xu +Message-Id: <1571729728-23284-3-git-send-email-cathy.zhang@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 5af514d0cb314f43bc53f2aefb437f6451d64d0c) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/cpu.h | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 60304cc..e77d101 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -772,6 +772,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_7_0_EDX_AVX512_4FMAPS (1U << 3) + /* Speculation Control */ + #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) ++/* Single Thread Indirect Branch Predictors */ ++#define CPUID_7_0_EDX_STIBP (1U << 27) + /* Arch Capabilities */ + #define CPUID_7_0_EDX_ARCH_CAPABILITIES (1U << 29) + /* Core Capability */ +-- +1.8.3.1 + diff --git a/kvm-i386-Add-new-CPU-model-Cooperlake.patch b/kvm-i386-Add-new-CPU-model-Cooperlake.patch new file mode 100755 index 0000000..289d1e3 --- /dev/null +++ b/kvm-i386-Add-new-CPU-model-Cooperlake.patch @@ -0,0 +1,108 @@ +From cf62577aed781b2515ea97b9f42285c2f608a7bf Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 15 May 2020 18:02:42 +0100 +Subject: [PATCH 16/17] i386: Add new CPU model Cooperlake + +RH-Author: plai@redhat.com +Message-id: <20200515180243.17488-4-plai@redhat.com> +Patchwork-id: 96608 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 3/4] i386: Add new CPU model Cooperlake +Bugzilla: 1769912 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Cathy Zhang + +Cooper Lake is intel's successor to Cascade Lake, the new +CPU model inherits features from Cascadelake-Server, while +add one platform associated new feature: AVX512_BF16. Meanwhile, +add STIBP for speculative execution. + +Signed-off-by: Cathy Zhang +Reviewed-by: Xiaoyao Li +Reviewed-by: Tao Xu +Message-Id: <1571729728-23284-4-git-send-email-cathy.zhang@intel.com> +Reviewed-by: Bruce Rogers +Signed-off-by: Eduardo Habkost +(cherry picked from commit 22a866b6166db5caa4abaa6e656c2a431fa60726) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/cpu.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 60 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 0f0a2db..996a74f 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -3161,6 +3161,66 @@ static X86CPUDefinition builtin_x86_defs[] = { + } + }, + { ++ .name = "Cooperlake", ++ .level = 0xd, ++ .vendor = CPUID_VENDOR_INTEL, ++ .family = 6, ++ .model = 85, ++ .stepping = 10, ++ .features[FEAT_1_EDX] = ++ CPUID_VME | CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | ++ CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | ++ CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | ++ CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | ++ CPUID_DE | CPUID_FP87, ++ .features[FEAT_1_ECX] = ++ CPUID_EXT_AVX | CPUID_EXT_XSAVE | CPUID_EXT_AES | ++ CPUID_EXT_POPCNT | CPUID_EXT_X2APIC | CPUID_EXT_SSE42 | ++ CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | ++ CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3 | ++ CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_FMA | CPUID_EXT_MOVBE | ++ CPUID_EXT_PCID | CPUID_EXT_F16C | CPUID_EXT_RDRAND, ++ .features[FEAT_8000_0001_EDX] = ++ CPUID_EXT2_LM | CPUID_EXT2_PDPE1GB | CPUID_EXT2_RDTSCP | ++ CPUID_EXT2_NX | CPUID_EXT2_SYSCALL, ++ .features[FEAT_8000_0001_ECX] = ++ CPUID_EXT3_ABM | CPUID_EXT3_LAHF_LM | CPUID_EXT3_3DNOWPREFETCH, ++ .features[FEAT_7_0_EBX] = ++ CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | ++ CPUID_7_0_EBX_HLE | CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_SMEP | ++ CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID | ++ CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | ++ CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLWB | ++ CPUID_7_0_EBX_AVX512F | CPUID_7_0_EBX_AVX512DQ | ++ CPUID_7_0_EBX_AVX512BW | CPUID_7_0_EBX_AVX512CD | ++ CPUID_7_0_EBX_AVX512VL | CPUID_7_0_EBX_CLFLUSHOPT, ++ .features[FEAT_7_0_ECX] = ++ CPUID_7_0_ECX_PKU | ++ CPUID_7_0_ECX_AVX512VNNI, ++ .features[FEAT_7_0_EDX] = ++ 
CPUID_7_0_EDX_SPEC_CTRL | CPUID_7_0_EDX_STIBP | ++ CPUID_7_0_EDX_SPEC_CTRL_SSBD | CPUID_7_0_EDX_ARCH_CAPABILITIES, ++ .features[FEAT_ARCH_CAPABILITIES] = ++ MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | ++ MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO, ++ .features[FEAT_7_1_EAX] = ++ CPUID_7_1_EAX_AVX512_BF16, ++ /* ++ * Missing: XSAVES (not supported by some Linux versions, ++ * including v4.1 to v4.12). ++ * KVM doesn't yet expose any XSAVES state save component, ++ * and the only one defined in Skylake (processor tracing) ++ * probably will block migration anyway. ++ */ ++ .features[FEAT_XSAVE] = ++ CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | ++ CPUID_XSAVE_XGETBV1, ++ .features[FEAT_6_EAX] = ++ CPUID_6_EAX_ARAT, ++ .xlevel = 0x80000008, ++ .model_id = "Intel Xeon Processor (Cooperlake)", ++ }, ++ { + .name = "Icelake-Client", + .level = 0xd, + .vendor = CPUID_VENDOR_INTEL, +-- +1.8.3.1 + diff --git a/kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch b/kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch new file mode 100755 index 0000000..5c335f8 --- /dev/null +++ b/kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch @@ -0,0 +1,213 @@ +From 4daa8dca77edec191dfe0ae4a0a9fc70f8f63607 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Wed, 24 Feb 2021 11:30:37 -0500 +Subject: [PATCH 4/4] i386: Add the support for AMD EPYC 3rd generation + processors + +RH-Author: Dr. David Alan Gilbert +Message-id: <20210224113037.15599-5-dgilbert@redhat.com> +Patchwork-id: 101202 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 4/4] i386: Add the support for AMD EPYC 3rd generation processors +Bugzilla: 1790620 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Peter Xu + +From: Babu Moger + +Adds the support for AMD 3rd generation processors. The model +display for the new processor will be EPYC-Milan. 
+ +Adds the following new feature bits on top of the feature bits from +the first and second generation EPYC models. + +pcid : Process context identifiers support +ibrs : Indirect Branch Restricted Speculation +ssbd : Speculative Store Bypass Disable +erms : Enhanced REP MOVSB/STOSB support +fsrm : Fast Short REP MOVSB support +invpcid : Invalidate processor context ID +pku : Protection keys support +svme-addr-chk : SVM instructions address check for #GP handling + +Depends on the following kernel commits: +14c2bf81fcd2 ("KVM: SVM: Fix #GP handling for doubly-nested virtualization") +3b9c723ed7cf ("KVM: SVM: Add support for SVM instruction address check change") +4aa2691dcbd3 ("8ce1c461188799d863398dd2865d KVM: x86: Factor out x86 instruction emulation with decoding") +4407a797e941 ("KVM: SVM: Enable INVPCID feature on AMD") +9715092f8d7e ("KVM: X86: Move handling of INVPCID types to x86") +3f3393b3ce38 ("KVM: X86: Rename and move the function vmx_handle_memory_failure to x86.c") +830bd71f2c06 ("KVM: SVM: Remove set_cr_intercept, clr_cr_intercept and is_cr_intercept") +4c44e8d6c193 ("KVM: SVM: Add new intercept word in vmcb_control_area") +c62e2e94b9d4 ("KVM: SVM: Modify 64 bit intercept field to two 32 bit vectors") +9780d51dc2af ("KVM: SVM: Modify intercept_exceptions to generic intercepts") +30abaa88382c ("KVM: SVM: Change intercept_dr to generic intercepts") +03bfeeb988a9 ("KVM: SVM: Change intercept_cr to generic intercepts") +c45ad7229d13 ("KVM: SVM: Introduce vmcb_(set_intercept/clr_intercept/_is_intercept)") +a90c1ed9f11d ("(pcid) KVM: nSVM: Remove unused field") +fa44b82eb831 ("KVM: x86: Move MPK feature detection to common code") +38f3e775e9c2 ("x86/Kconfig: Update config and kernel doc for MPK feature on AMD") +37486135d3a7 ("KVM: x86: Fix pkru save/restore when guest CR4.PKE=0, move it to x86.c") + +Signed-off-by: Babu Moger +Message-Id: <161290460478.11352.8933244555799318236.stgit@bmoger-ubuntu> +Signed-off-by: Eduardo Habkost +(cherry picked from 
commit 623972ceae091b31331ae4a1dc94fe5cbb891937) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 107 +++++++++++++++++++++++++++++++++++++++++++++- + target/i386/cpu.h | 4 ++ + 2 files changed, 110 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 7227c803c3..d5b0d4b7f0 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1133,7 +1133,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + "clzero", NULL, "xsaveerptr", NULL, + NULL, NULL, NULL, NULL, + NULL, "wbnoinvd", NULL, NULL, +- "ibpb", NULL, NULL, "amd-stibp", ++ "ibpb", NULL, "ibrs", "amd-stibp", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "amd-ssbd", "virt-ssbd", "amd-no-ssb", NULL, +@@ -1853,6 +1853,56 @@ static CPUCaches epyc_rome_cache_info = { + }, + }; + ++static CPUCaches epyc_milan_cache_info = { ++ .l1d_cache = &(CPUCacheInfo) { ++ .type = DATA_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = 1, ++ .no_invd_sharing = true, ++ }, ++ .l1i_cache = &(CPUCacheInfo) { ++ .type = INSTRUCTION_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = 1, ++ .no_invd_sharing = true, ++ }, ++ .l2_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 2, ++ .size = 512 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 1024, ++ .lines_per_tag = 1, ++ }, ++ .l3_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 3, ++ .size = 32 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 32768, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .inclusive = true, ++ .complex_indexing = true, ++ }, ++}; ++ + /* The following VMX features are not supported by KVM and are left out in the + * CPU definitions: + * +@@ -4124,6 +4174,61 @@ static 
X86CPUDefinition builtin_x86_defs[] = { + .model_id = "AMD EPYC-Rome Processor", + .cache_info = &epyc_rome_cache_info, + }, ++ { ++ .name = "EPYC-Milan", ++ .level = 0xd, ++ .vendor = CPUID_VENDOR_AMD, ++ .family = 25, ++ .model = 1, ++ .stepping = 1, ++ .features[FEAT_1_EDX] = ++ CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | CPUID_CLFLUSH | ++ CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | CPUID_PGE | ++ CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | CPUID_MCE | ++ CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | CPUID_DE | ++ CPUID_VME | CPUID_FP87, ++ .features[FEAT_1_ECX] = ++ CPUID_EXT_RDRAND | CPUID_EXT_F16C | CPUID_EXT_AVX | ++ CPUID_EXT_XSAVE | CPUID_EXT_AES | CPUID_EXT_POPCNT | ++ CPUID_EXT_MOVBE | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | ++ CPUID_EXT_CX16 | CPUID_EXT_FMA | CPUID_EXT_SSSE3 | ++ CPUID_EXT_MONITOR | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3 | ++ CPUID_EXT_PCID, ++ .features[FEAT_8000_0001_EDX] = ++ CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_PDPE1GB | ++ CPUID_EXT2_FFXSR | CPUID_EXT2_MMXEXT | CPUID_EXT2_NX | ++ CPUID_EXT2_SYSCALL, ++ .features[FEAT_8000_0001_ECX] = ++ CPUID_EXT3_OSVW | CPUID_EXT3_3DNOWPREFETCH | ++ CPUID_EXT3_MISALIGNSSE | CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | ++ CPUID_EXT3_CR8LEG | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM | ++ CPUID_EXT3_TOPOEXT | CPUID_EXT3_PERFCORE, ++ .features[FEAT_8000_0008_EBX] = ++ CPUID_8000_0008_EBX_CLZERO | CPUID_8000_0008_EBX_XSAVEERPTR | ++ CPUID_8000_0008_EBX_WBNOINVD | CPUID_8000_0008_EBX_IBPB | ++ CPUID_8000_0008_EBX_IBRS | CPUID_8000_0008_EBX_STIBP | ++ CPUID_8000_0008_EBX_AMD_SSBD, ++ .features[FEAT_7_0_EBX] = ++ CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | ++ CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_RDSEED | ++ CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | ++ CPUID_7_0_EBX_SHA_NI | CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_ERMS | ++ CPUID_7_0_EBX_INVPCID, ++ .features[FEAT_7_0_ECX] = ++ CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_RDPID | 
CPUID_7_0_ECX_PKU, ++ .features[FEAT_7_0_EDX] = ++ CPUID_7_0_EDX_FSRM, ++ .features[FEAT_XSAVE] = ++ CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | ++ CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, ++ .features[FEAT_6_EAX] = ++ CPUID_6_EAX_ARAT, ++ .features[FEAT_SVM] = ++ CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE | CPUID_SVM_SVME_ADDR_CHK, ++ .xlevel = 0x8000001E, ++ .model_id = "AMD EPYC-Milan Processor", ++ .cache_info = &epyc_milan_cache_info, ++ }, + }; + + /* KVM-specific features that are automatically added/removed +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index e1b67910c2..7a3aa40201 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -800,8 +800,12 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_8000_0008_EBX_WBNOINVD (1U << 9) + /* Indirect Branch Prediction Barrier */ + #define CPUID_8000_0008_EBX_IBPB (1U << 12) ++/* Indirect Branch Restricted Speculation */ ++#define CPUID_8000_0008_EBX_IBRS (1U << 14) + /* Single Thread Indirect Branch Predictors */ + #define CPUID_8000_0008_EBX_STIBP (1U << 15) ++/* Speculative Store Bypass Disable */ ++#define CPUID_8000_0008_EBX_AMD_SSBD (1U << 24) + + #define CPUID_XSAVE_XSAVEOPT (1U << 0) + #define CPUID_XSAVE_XSAVEC (1U << 1) +-- +2.27.0 + diff --git a/kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch b/kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch new file mode 100755 index 0000000..17251bf --- /dev/null +++ b/kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch @@ -0,0 +1,82 @@ +From d3b9c1891a6d05308dd5ea119d2c32c8f98a25da Mon Sep 17 00:00:00 2001 +From: Eduardo Habkost +Date: Tue, 30 Jun 2020 23:40:15 -0400 +Subject: [PATCH 1/4] i386: Mask SVM features if nested SVM is disabled + +RH-Author: Eduardo Habkost +Message-id: <20200630234015.166253-2-ehabkost@redhat.com> +Patchwork-id: 97852 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] i386: Mask SVM features if nested SVM is disabled +Bugzilla: 1835390 +RH-Acked-by: Igor Mammedov +RH-Acked-by: Bandan Das 
+RH-Acked-by: Dr. David Alan Gilbert + +QEMU incorrectly validates FEAT_SVM feature flags against +GET_SUPPORTED_CPUID even if SVM features are being masked out by +cpu_x86_cpuid(). This can make QEMU print warnings on most AMD +CPU models, even when SVM nesting is disabled (which is the +default). + +This bug was never detected before because of a Linux KVM bug: +until Linux v5.6, KVM was not filtering out SVM features in +GET_SUPPORTED_CPUID when nested was disabled. This KVM bug was +fixed in Linux v5.7-rc1, on Linux commit a50718cc3f43 ("KVM: +nSVM: Expose SVM features to L1 iff nested is enabled"). + +Fix the problem by adding a CPUID_EXT3_SVM dependency to all +FEAT_SVM feature flags in the feature_dependencies table. + +Reported-by: Yanan Fu +Signed-off-by: Eduardo Habkost +Message-Id: <20200623230116.277409-1-ehabkost@redhat.com> +[Fix testcase. - Paolo] +Signed-off-by: Paolo Bonzini +(cherry picked from commit 730319aef0fcb94f11a4a2d32656437fdde7efdd) +Signed-off-by: Eduardo Habkost +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/cpu.c | 4 ++++ + tests/test-x86-cpuid-compat.c | 4 ++-- + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 7d7b016bb7..a343de0c9d 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1477,6 +1477,10 @@ static FeatureDep feature_dependencies[] = { + .from = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_VMFUNC }, + .to = { FEAT_VMX_VMFUNC, ~0ull }, + }, ++ { ++ .from = { FEAT_8000_0001_ECX, CPUID_EXT3_SVM }, ++ .to = { FEAT_SVM, ~0ull }, ++ }, + }; + + typedef struct X86RegisterInfo32 { +diff --git a/tests/test-x86-cpuid-compat.c b/tests/test-x86-cpuid-compat.c +index e7c075ed98..983aa0719a 100644 +--- a/tests/test-x86-cpuid-compat.c ++++ b/tests/test-x86-cpuid-compat.c +@@ -256,7 +256,7 @@ int main(int argc, char **argv) + "-cpu 486,+invtsc", "xlevel", 0x80000007); + /* CPUID[8000_000A].EDX: */ + add_cpuid_test("x86/cpuid/auto-xlevel/486/npt", +- "-cpu 486,+npt", "xlevel", 0x8000000A); ++ "-cpu 486,+svm,+npt", "xlevel", 0x8000000A); + /* CPUID[C000_0001].EDX: */ + add_cpuid_test("x86/cpuid/auto-xlevel2/phenom/xstore", + "-cpu phenom,+xstore", "xlevel2", 0xC0000001); +@@ -349,7 +349,7 @@ int main(int argc, char **argv) + "-machine pc-i440fx-2.4 -cpu SandyBridge,", + "xlevel", 0x80000008); + add_cpuid_test("x86/cpuid/xlevel-compat/pc-i440fx-2.4/npt-on", +- "-machine pc-i440fx-2.4 -cpu SandyBridge,+npt", ++ "-machine pc-i440fx-2.4 -cpu SandyBridge,+svm,+npt", + "xlevel", 0x80000008); + #endif + +-- +2.27.0 + diff --git a/kvm-i386-Remove-cpu64-rhel6-CPU-model.patch b/kvm-i386-Remove-cpu64-rhel6-CPU-model.patch new file mode 100755 index 0000000..5d62ace --- /dev/null +++ b/kvm-i386-Remove-cpu64-rhel6-CPU-model.patch @@ -0,0 +1,77 @@ +From 4543a3c19816bd07f27eb900f20ae609df03703c Mon Sep 17 00:00:00 2001 +From: Eduardo Habkost +Date: Mon, 23 Dec 2019 21:10:31 +0000 +Subject: [PATCH 1/2] i386: Remove cpu64-rhel6 CPU model + +RH-Author: Eduardo Habkost +Message-id: 
<20191223211031.26503-1-ehabkost@redhat.com> +Patchwork-id: 93213 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH] i386: Remove cpu64-rhel6 CPU model +Bugzilla: 1741345 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Laszlo Ersek + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1741345 +BRANCH: rhel-av-8.2.0 +Upstream: not applicable +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=25525975 + +We don't provide rhel6 machine types anymore, so we don't need to +provide compatibility with RHEL6. cpu64-rhel6 was documented as +deprecated and scheduled for removal in 8.2, so now it's time to +remove it. + +Signed-off-by: Eduardo Habkost +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 26 +------------------------- + 1 file changed, 1 insertion(+), 25 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 790db77..6dce6f2 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1829,12 +1829,7 @@ static CPUCaches epyc_cache_info = { + + static X86CPUDefinition builtin_x86_defs[] = { + { +- /* qemu64 is the default CPU model for all *-rhel7.* machine-types. +- * The default on RHEL-6 was cpu64-rhel6. +- * libvirt assumes that qemu64 is the default for _all_ machine-types, +- * so we should try to keep qemu64 and cpu64-rhel6 as similar as +- * possible. 
+- */ ++ /* qemu64 is the default CPU model for all machine-types */ + .name = "qemu64", + .level = 0xd, + .vendor = CPUID_VENDOR_AMD, +@@ -2135,25 +2130,6 @@ static X86CPUDefinition builtin_x86_defs[] = { + .model_id = "Intel(R) Atom(TM) CPU N270 @ 1.60GHz", + }, + { +- .name = "cpu64-rhel6", +- .level = 4, +- .vendor = CPUID_VENDOR_AMD, +- .family = 6, +- .model = 13, +- .stepping = 3, +- .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | +- CPUID_MMX | CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | +- CPUID_MCA | CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | +- CPUID_CX8 | CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | +- CPUID_PSE | CPUID_DE | CPUID_FP87, +- .features[FEAT_1_ECX] = CPUID_EXT_CX16 | CPUID_EXT_SSE3, +- .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_NX | CPUID_EXT2_SYSCALL, +- .features[FEAT_8000_0001_ECX] = CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | +- CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, +- .xlevel = 0x8000000A, +- .model_id = "QEMU Virtual CPU version (cpu64-rhel6)", +- }, +- { + .name = "Conroe", + .level = 10, + .vendor = CPUID_VENDOR_INTEL, +-- +1.8.3.1 + diff --git a/kvm-i386-Resolve-CPU-models-to-v1-by-default.patch b/kvm-i386-Resolve-CPU-models-to-v1-by-default.patch new file mode 100755 index 0000000..1027341 --- /dev/null +++ b/kvm-i386-Resolve-CPU-models-to-v1-by-default.patch @@ -0,0 +1,95 @@ +From ccda4494b0ea4b81b6b0c3e539a0bcf7e673c68c Mon Sep 17 00:00:00 2001 +From: Eduardo Habkost +Date: Thu, 5 Dec 2019 21:56:50 +0000 +Subject: [PATCH 01/18] i386: Resolve CPU models to v1 by default + +RH-Author: Eduardo Habkost +Message-id: <20191205225650.772600-2-ehabkost@redhat.com> +Patchwork-id: 92907 +O-Subject: [RHEL-AV-8.1.1 qemu-kvm PATCH 1/1] i386: Resolve CPU models to v1 by default +Bugzilla: 1787291 1779078 1779078 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1779078 +Brew: 
https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=25187823 +Upstream: submitted, Message-Id: <20191205223339.764534-1-ehabkost@redhat.com> + +When running `query-cpu-definitions` with `-machine none`, +QEMU is resolving all CPU models to their latest versions. The +actual CPU model version being used by another machine type (e.g. +`pc-q35-4.0`) might be different. + +In theory, this was OK because the correct CPU model +version is returned when using the correct `-machine` argument. + +Except that in practice, this breaks libvirt expectations: +libvirt always uses `-machine none` when checking if a CPU model +is runnable, because runnability is not expected to be affected +when the machine type is changed. + +For example, when running on a Haswell host without TSX, +Haswell-v4 is runnable, but Haswell-v1 is not. On those hosts, +`query-cpu-definitions` says Haswell is runnable if using +`-machine none`, but Haswell is actually not runnable using any +of the `pc-*` machine types (because they resolve Haswell to +Haswell-v1). In other words, we're breaking the "runnability +guarantee" we promised to not break for a few releases (see +qemu-deprecated.texi). + +To address this issue, change the default CPU model version to v1 +on all machine types, so we make `query-cpu-definitions` output +when using `-machine none` match the results when using `pc-*`. +This will change in the future (the plan is to always return the +latest CPU model version if using `-machine none`), but only +after giving libvirt the opportunity to adapt. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1779078 +Signed-off-by: Eduardo Habkost +Signed-off-by: Danilo C. L. 
de Paula +--- + qemu-deprecated.texi | 7 +++++++ + target/i386/cpu.c | 8 +++++++- + 2 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/qemu-deprecated.texi b/qemu-deprecated.texi +index 4b4b742..534ebe9 100644 +--- a/qemu-deprecated.texi ++++ b/qemu-deprecated.texi +@@ -374,6 +374,13 @@ guarantees must resolve the CPU model aliases using te + ``alias-of'' field returned by the ``query-cpu-definitions'' QMP + command. + ++While those guarantees are kept, the return value of ++``query-cpu-definitions'' will have existing CPU model aliases ++point to a version that doesn't break runnability guarantees ++(specifically, version 1 of those CPU models). In future QEMU ++versions, aliases will point to newer CPU model versions ++depending on the machine type, so management software must ++resolve CPU model aliases before starting a virtual machine. + + @node Recently removed features + @appendix Recently removed features +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 6dce6f2..863192c 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -3926,7 +3926,13 @@ static PropValue tcg_default_props[] = { + }; + + +-X86CPUVersion default_cpu_version = CPU_VERSION_LATEST; ++/* ++ * We resolve CPU model aliases using -v1 when using "-machine ++ * none", but this is just for compatibility while libvirt isn't ++ * adapted to resolve CPU model versions before creating VMs. ++ * See "Runnability guarantee of CPU models" at * qemu-deprecated.texi. 
++ */ ++X86CPUVersion default_cpu_version = 1; + + void x86_cpu_set_default_version(X86CPUVersion version) + { +-- +1.8.3.1 + diff --git a/kvm-ide-atapi-check-logical-block-address-and-read-size-.patch b/kvm-ide-atapi-check-logical-block-address-and-read-size-.patch new file mode 100755 index 0000000..706bd8b --- /dev/null +++ b/kvm-ide-atapi-check-logical-block-address-and-read-size-.patch @@ -0,0 +1,120 @@ +From 0453588f95294ed5ce912cb8b810a322bf9d91e0 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 25 Feb 2021 19:43:02 -0500 +Subject: [PATCH] ide: atapi: check logical block address and read size + (CVE-2020-29443) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210225194302.3137699-2-jmaloy@redhat.com> +Patchwork-id: 101208 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 1/1] ide: atapi: check logical block address and read size (CVE-2020-29443) +Bugzilla: 1917451 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Danilo de Paula +RH-Acked-by: Paolo Bonzini + +From: Prasad J Pandit + +While processing ATAPI cmd_read/cmd_read_cd commands, the +Logical Block Address (LBA) may be invalid or close to the last block, +leading to OOB access issues. Add a range check to avoid it. + +Fixes: CVE-2020-29443 +Reported-by: Wenxiang Qian +Suggested-by: Paolo Bonzini +Reviewed-by: Paolo Bonzini +Signed-off-by: Prasad J Pandit +Message-Id: <20210118115130.457044-1-ppandit@redhat.com> +Signed-off-by: Paolo Bonzini + +(cherry picked from commit b8d7f1bc59276fec85e4d09f1567613a3e14d31e) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/ide/atapi.c | 30 ++++++++++++++++++++++++------ + 1 file changed, 24 insertions(+), 6 deletions(-) + +diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c +index 17a9d635d8..d064935c8d 100644 +--- a/hw/ide/atapi.c ++++ b/hw/ide/atapi.c +@@ -320,6 +320,8 @@ static void ide_atapi_cmd_reply(IDEState *s, int size, int max_size) + static void ide_atapi_cmd_read_pio(IDEState *s, int lba, int nb_sectors, + int sector_size) + { ++ assert(0 <= lba && lba < (s->nb_sectors >> 2)); ++ + s->lba = lba; + s->packet_transfer_size = nb_sectors * sector_size; + s->elementary_transfer_size = 0; +@@ -418,6 +420,8 @@ eot: + static void ide_atapi_cmd_read_dma(IDEState *s, int lba, int nb_sectors, + int sector_size) + { ++ assert(0 <= lba && lba < (s->nb_sectors >> 2)); ++ + s->lba = lba; + s->packet_transfer_size = nb_sectors * sector_size; + s->io_buffer_size = 0; +@@ -971,35 +975,49 @@ static void cmd_prevent_allow_medium_removal(IDEState *s, uint8_t* buf) + + static void cmd_read(IDEState *s, uint8_t* buf) + { +- int nb_sectors, lba; ++ unsigned int nb_sectors, lba; ++ ++ /* Total logical sectors of ATAPI_SECTOR_SIZE(=2048) bytes */ ++ uint64_t total_sectors = s->nb_sectors >> 2; + + if (buf[0] == GPCMD_READ_10) { + nb_sectors = lduw_be_p(buf + 7); + } else { + nb_sectors = ldl_be_p(buf + 6); + } +- +- lba = ldl_be_p(buf + 2); + if (nb_sectors == 0) { + ide_atapi_cmd_ok(s); + return; + } + ++ lba = ldl_be_p(buf + 2); ++ if (lba >= total_sectors || lba + nb_sectors - 1 >= total_sectors) { ++ ide_atapi_cmd_error(s, ILLEGAL_REQUEST, ASC_LOGICAL_BLOCK_OOR); ++ return; ++ } ++ + ide_atapi_cmd_read(s, lba, nb_sectors, 2048); + } + + static void cmd_read_cd(IDEState *s, uint8_t* buf) + { +- int nb_sectors, lba, transfer_request; ++ unsigned int nb_sectors, lba, transfer_request; + +- nb_sectors = (buf[6] << 16) | (buf[7] << 8) | buf[8]; +- lba = ldl_be_p(buf + 2); ++ /* Total logical sectors of ATAPI_SECTOR_SIZE(=2048) bytes */ ++ uint64_t total_sectors = s->nb_sectors >> 2; + ++ 
nb_sectors = (buf[6] << 16) | (buf[7] << 8) | buf[8]; + if (nb_sectors == 0) { + ide_atapi_cmd_ok(s); + return; + } + ++ lba = ldl_be_p(buf + 2); ++ if (lba >= total_sectors || lba + nb_sectors - 1 >= total_sectors) { ++ ide_atapi_cmd_error(s, ILLEGAL_REQUEST, ASC_LOGICAL_BLOCK_OOR); ++ return; ++ } ++ + transfer_request = buf[9] & 0xf8; + if (transfer_request == 0x00) { + /* nothing */ +-- +2.27.0 + diff --git a/kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch b/kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch new file mode 100755 index 0000000..db89a06 --- /dev/null +++ b/kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch @@ -0,0 +1,58 @@ +From d8f84a8086dbe339a9f97dbcd10abd6379525068 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:37 -0500 +Subject: [PATCH 13/17] intel_iommu: Skip page walking on device iotlb + invalidations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-13-eperezma@redhat.com> +Patchwork-id: 100605 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 12/13] intel_iommu: Skip page walking on device iotlb invalidations +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +Although they didn't reach the notifier because of the filtering in +memory_region_notify_iommu_one, the vt-d was still splitting huge +memory invalidations in chunks. Skipping it. + +This improves performance in case of netperf with vhost-net: +* TCP_STREAM: From 1923.6Mbit/s to 2175.13Mbit/s (13%) +* TCP_RR: From 8464.73 trans/s to 8932.703333 trans/s (5.5%) +* UDP_RR: From 8562.08 trans/s to 9005.62/s (5.1%) +* UDP_STREAM: No change observed (insignificant 0.1% improvement) + +Signed-off-by: Eugenio Pérez +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-5-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +(cherry picked from commit f7701e2c7983b680790af47117577b285b6a1aed) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/i386/intel_iommu.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 3640bc2ed15..2b270f06645 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -1421,6 +1421,10 @@ static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) + VTDContextEntry ce; + IOMMUNotifier *n; + ++ if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) { ++ return 0; ++ } ++ + ret = vtd_dev_to_context_entry(vtd_as->iommu_state, + pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce); +-- +2.27.0 + diff --git a/kvm-introduce-kvm_kernel_irqchip_-functions.patch b/kvm-introduce-kvm_kernel_irqchip_-functions.patch new file mode 100755 index 0000000..b171749 --- /dev/null +++ b/kvm-introduce-kvm_kernel_irqchip_-functions.patch @@ -0,0 +1,281 @@ +From 3899672db472c1ca530badd49d17726a1057f8af Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 5 Jun 2020 07:41:10 -0400 +Subject: [PATCH 40/42] kvm: introduce kvm_kernel_irqchip_* functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200605074111.2185-3-thuth@redhat.com> +Patchwork-id: 97369 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/3] kvm: introduce kvm_kernel_irqchip_* functions +Bugzilla: 1756946 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Paolo Bonzini + +The KVMState struct is opaque, so provide accessors for the fields +that will be moved from current_machine to the accelerator. For now +they just forward to the machine object, but this will change. 
+ +Signed-off-by: Paolo Bonzini +(cherry picked from commit 4376c40dedb22530738eeb104a603e94ed03f719) + +Conflicts: + accel/kvm/kvm-all.c + (contextual conflict due to missing other commits in downstream) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + accel/kvm/kvm-all.c | 23 +++++++++++++++++++---- + hw/ppc/e500.c | 5 ++--- + hw/ppc/spapr_irq.c | 16 ++++------------ + include/sysemu/kvm.h | 7 +++++-- + target/arm/kvm.c | 8 ++++---- + target/i386/kvm.c | 4 ++-- + target/mips/kvm.c | 2 +- + target/ppc/kvm.c | 2 +- + target/s390x/kvm.c | 2 +- + 9 files changed, 39 insertions(+), 30 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 5007bdad96..b0250209f5 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -1772,7 +1772,7 @@ void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) + g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); + } + +-static void kvm_irqchip_create(MachineState *machine, KVMState *s) ++static void kvm_irqchip_create(KVMState *s) + { + int ret; + +@@ -1790,9 +1790,9 @@ static void kvm_irqchip_create(MachineState *machine, KVMState *s) + + /* First probe and see if there's a arch-specific hook to create the + * in-kernel irqchip for us */ +- ret = kvm_arch_irqchip_create(machine, s); ++ ret = kvm_arch_irqchip_create(s); + if (ret == 0) { +- if (machine_kernel_irqchip_split(machine)) { ++ if (kvm_kernel_irqchip_split()) { + perror("Split IRQ chip mode not supported."); + exit(1); + } else { +@@ -2076,7 +2076,7 @@ static int kvm_init(MachineState *ms) + } + + if (machine_kernel_irqchip_allowed(ms)) { +- kvm_irqchip_create(ms, s); ++ kvm_irqchip_create(s); + } + + if (kvm_eventfds_allowed) { +@@ -2966,6 +2966,21 @@ static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, + return false; + } + ++bool kvm_kernel_irqchip_allowed(void) ++{ ++ return machine_kernel_irqchip_allowed(current_machine); ++} ++ ++bool kvm_kernel_irqchip_required(void) ++{ ++ return 
machine_kernel_irqchip_required(current_machine); ++} ++ ++bool kvm_kernel_irqchip_split(void) ++{ ++ return machine_kernel_irqchip_split(current_machine); ++} ++ + static void kvm_accel_class_init(ObjectClass *oc, void *data) + { + AccelClass *ac = ACCEL_CLASS(oc); +diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c +index 91cd4c26f9..12b6a5b2a8 100644 +--- a/hw/ppc/e500.c ++++ b/hw/ppc/e500.c +@@ -793,7 +793,6 @@ static DeviceState *ppce500_init_mpic(PPCE500MachineState *pms, + MemoryRegion *ccsr, + IrqLines *irqs) + { +- MachineState *machine = MACHINE(pms); + const PPCE500MachineClass *pmc = PPCE500_MACHINE_GET_CLASS(pms); + DeviceState *dev = NULL; + SysBusDevice *s; +@@ -801,10 +800,10 @@ static DeviceState *ppce500_init_mpic(PPCE500MachineState *pms, + if (kvm_enabled()) { + Error *err = NULL; + +- if (machine_kernel_irqchip_allowed(machine)) { ++ if (kvm_kernel_irqchip_allowed()) { + dev = ppce500_init_mpic_kvm(pmc, irqs, &err); + } +- if (machine_kernel_irqchip_required(machine) && !dev) { ++ if (kvm_kernel_irqchip_required() && !dev) { + error_reportf_err(err, + "kernel_irqchip requested but unavailable: "); + exit(1); +diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c +index 9da423658a..f388d07bf9 100644 +--- a/hw/ppc/spapr_irq.c ++++ b/hw/ppc/spapr_irq.c +@@ -75,12 +75,11 @@ int spapr_irq_init_kvm(SpaprInterruptControllerInitKvm fn, + uint32_t nr_servers, + Error **errp) + { +- MachineState *machine = MACHINE(qdev_get_machine()); + Error *local_err = NULL; + +- if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) { ++ if (kvm_enabled() && kvm_kernel_irqchip_allowed()) { + if (fn(intc, nr_servers, &local_err) < 0) { +- if (machine_kernel_irqchip_required(machine)) { ++ if (kvm_kernel_irqchip_required()) { + error_prepend(&local_err, + "kernel_irqchip requested but unavailable: "); + error_propagate(errp, local_err); +@@ -185,7 +184,7 @@ static int spapr_irq_check(SpaprMachineState *spapr, Error **errp) + */ + if (kvm_enabled() && + spapr->irq == 
&spapr_irq_dual && +- machine_kernel_irqchip_required(machine) && ++ kvm_kernel_irqchip_required() && + xics_kvm_has_broken_disconnect(spapr)) { + error_setg(errp, "KVM is too old to support ic-mode=dual,kernel-irqchip=on"); + return -1; +@@ -288,20 +287,13 @@ uint32_t spapr_irq_nr_msis(SpaprMachineState *spapr) + + void spapr_irq_init(SpaprMachineState *spapr, Error **errp) + { +- MachineState *machine = MACHINE(spapr); + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); + +- if (machine_kernel_irqchip_split(machine)) { ++ if (kvm_enabled() && kvm_kernel_irqchip_split()) { + error_setg(errp, "kernel_irqchip split mode not supported on pseries"); + return; + } + +- if (!kvm_enabled() && machine_kernel_irqchip_required(machine)) { +- error_setg(errp, +- "kernel_irqchip requested but only available with KVM"); +- return; +- } +- + if (spapr_irq_check(spapr, errp) < 0) { + return; + } +diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h +index 9fe233b9bf..aaf2a502e8 100644 +--- a/include/sysemu/kvm.h ++++ b/include/sysemu/kvm.h +@@ -519,10 +519,13 @@ void kvm_pc_gsi_handler(void *opaque, int n, int level); + void kvm_pc_setup_irq_routing(bool pci_enabled); + void kvm_init_irq_routing(KVMState *s); + ++bool kvm_kernel_irqchip_allowed(void); ++bool kvm_kernel_irqchip_required(void); ++bool kvm_kernel_irqchip_split(void); ++ + /** + * kvm_arch_irqchip_create: + * @KVMState: The KVMState pointer +- * @MachineState: The MachineState pointer + * + * Allow architectures to create an in-kernel irq chip themselves. 
+ * +@@ -530,7 +533,7 @@ void kvm_init_irq_routing(KVMState *s); + * 0: irq chip was not created + * > 0: irq chip was created + */ +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s); ++int kvm_arch_irqchip_create(KVMState *s); + + /** + * kvm_set_one_reg - set a register value in KVM via KVM_SET_ONE_REG ioctl +diff --git a/target/arm/kvm.c b/target/arm/kvm.c +index 4be9497402..418bcedc3e 100644 +--- a/target/arm/kvm.c ++++ b/target/arm/kvm.c +@@ -861,11 +861,11 @@ void kvm_arch_init_irq_routing(KVMState *s) + { + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { +- if (machine_kernel_irqchip_split(ms)) { +- perror("-machine kernel_irqchip=split is not supported on ARM."); +- exit(1); ++ if (kvm_kernel_irqchip_split()) { ++ perror("-machine kernel_irqchip=split is not supported on ARM."); ++ exit(1); + } + + /* If we can create the VGIC using the newer device control API, we +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index fcc8f7d1f3..f5c17e0028 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -4532,10 +4532,10 @@ void kvm_arch_init_irq_routing(KVMState *s) + } + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { + int ret; +- if (machine_kernel_irqchip_split(ms)) { ++ if (kvm_kernel_irqchip_split()) { + ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24); + if (ret) { + error_report("Could not enable split irqchip mode: %s", +diff --git a/target/mips/kvm.c b/target/mips/kvm.c +index 578bc14625..de3e26ef1f 100644 +--- a/target/mips/kvm.c ++++ b/target/mips/kvm.c +@@ -57,7 +57,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + return 0; + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { + return 0; + } +diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c +index c77f9848ec..461dc6dae1 100644 +--- a/target/ppc/kvm.c ++++ b/target/ppc/kvm.c +@@ 
-152,7 +152,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + return 0; + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { + return 0; + } +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 84d7cadd09..c589ef9034 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -386,7 +386,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + return 0; + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { + return 0; + } +-- +2.27.0 + diff --git a/kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch b/kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch new file mode 100755 index 0000000..a50bff9 --- /dev/null +++ b/kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch @@ -0,0 +1,241 @@ +From a4a984e67e276e643b8a51f39ca426d0967754a0 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Mon, 13 Jul 2020 14:24:51 -0400 +Subject: [PATCH 4/4] iotests/026: Move v3-exclusive test to new file +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Max Reitz +Message-id: <20200713142451.289703-5-mreitz@redhat.com> +Patchwork-id: 97956 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 4/4] iotests/026: Move v3-exclusive test to new file +Bugzilla: 1807057 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +data_file does not work with v2, and we probably want 026 to keep +working for v2 images. Thus, open a new file for v3-exclusive error +path test cases. + +Fixes: 81311255f217859413c94f2cd9cebf2684bbda94 + (“iotests/026: Test EIO on allocation in a data-file”) +Signed-off-by: Max Reitz +Message-Id: <20200311140707.1243218-1-mreitz@redhat.com> +Reviewed-by: John Snow +Tested-by: John Snow +Signed-off-by: Max Reitz +(cherry picked from commit c264e5d2f9f5d73977eac8e5d084f727b3d07ea9) + +Conflicts: + tests/qemu-iotests/group + - As per usual. 
+ +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/026 | 31 ----------- + tests/qemu-iotests/026.out | 6 -- + tests/qemu-iotests/026.out.nocache | 6 -- + tests/qemu-iotests/289 | 89 ++++++++++++++++++++++++++++++ + tests/qemu-iotests/289.out | 8 +++ + tests/qemu-iotests/group | 1 + + 6 files changed, 98 insertions(+), 43 deletions(-) + create mode 100755 tests/qemu-iotests/289 + create mode 100644 tests/qemu-iotests/289.out + +diff --git a/tests/qemu-iotests/026 b/tests/qemu-iotests/026 +index c1c96a41d9..3afd708863 100755 +--- a/tests/qemu-iotests/026 ++++ b/tests/qemu-iotests/026 +@@ -237,37 +237,6 @@ $QEMU_IO -c "write 0 $CLUSTER_SIZE" "$BLKDBG_TEST_IMG" | _filter_qemu_io + + _check_test_img + +-echo +-echo === Avoid freeing external data clusters on failure === +-echo +- +-# Similar test as the last one, except we test what happens when there +-# is an error when writing to an external data file instead of when +-# writing to a preallocated zero cluster +-_make_test_img -o "data_file=$TEST_IMG.data_file" $CLUSTER_SIZE +- +-# Put blkdebug above the data-file, and a raw node on top of that so +-# that blkdebug will see a write_aio event and emit an error +-$QEMU_IO -c "write 0 $CLUSTER_SIZE" \ +- "json:{ +- 'driver': 'qcow2', +- 'file': { 'driver': 'file', 'filename': '$TEST_IMG' }, +- 'data-file': { +- 'driver': 'raw', +- 'file': { +- 'driver': 'blkdebug', +- 'config': '$TEST_DIR/blkdebug.conf', +- 'image': { +- 'driver': 'file', +- 'filename': '$TEST_IMG.data_file' +- } +- } +- } +- }" \ +- | _filter_qemu_io +- +-_check_test_img +- + # success, all done + echo "*** done" + rm -f $seq.full +diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out +index c1b3b58482..83989996ff 100644 +--- a/tests/qemu-iotests/026.out ++++ b/tests/qemu-iotests/026.out +@@ -653,10 +653,4 @@ wrote 1024/1024 bytes at offset 0 + 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + write failed: Input/output error + No errors 
were found on the image. +- +-=== Avoid freeing external data clusters on failure === +- +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1024 data_file=TEST_DIR/t.IMGFMT.data_file +-write failed: Input/output error +-No errors were found on the image. + *** done +diff --git a/tests/qemu-iotests/026.out.nocache b/tests/qemu-iotests/026.out.nocache +index 8d5001648a..9359d26d7e 100644 +--- a/tests/qemu-iotests/026.out.nocache ++++ b/tests/qemu-iotests/026.out.nocache +@@ -661,10 +661,4 @@ wrote 1024/1024 bytes at offset 0 + 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + write failed: Input/output error + No errors were found on the image. +- +-=== Avoid freeing external data clusters on failure === +- +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1024 data_file=TEST_DIR/t.IMGFMT.data_file +-write failed: Input/output error +-No errors were found on the image. + *** done +diff --git a/tests/qemu-iotests/289 b/tests/qemu-iotests/289 +new file mode 100755 +index 0000000000..1c11d4030e +--- /dev/null ++++ b/tests/qemu-iotests/289 +@@ -0,0 +1,89 @@ ++#!/usr/bin/env bash ++# ++# qcow2 v3-exclusive error path testing ++# (026 tests paths common to v2 and v3) ++# ++# Copyright (C) 2020 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++seq=$(basename $0) ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! 
++ ++_cleanup() ++{ ++ _cleanup_test_img ++ rm "$TEST_DIR/blkdebug.conf" ++ rm -f "$TEST_IMG.data_file" ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++. ./common.rc ++. ./common.filter ++. ./common.pattern ++ ++_supported_fmt qcow2 ++_supported_proto file ++# This is a v3-exclusive test; ++# As for data_file, error paths often very much depend on whether ++# there is an external data file or not; so we create one exactly when ++# we want to test it ++_unsupported_imgopts 'compat=0.10' data_file ++ ++echo ++echo === Avoid freeing external data clusters on failure === ++echo ++ ++cat > "$TEST_DIR/blkdebug.conf" < +Date: Mon, 13 Jul 2020 14:24:50 -0400 +Subject: [PATCH 3/4] iotests/026: Test EIO on allocation in a data-file + +RH-Author: Max Reitz +Message-id: <20200713142451.289703-4-mreitz@redhat.com> +Patchwork-id: 97955 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 3/4] iotests/026: Test EIO on allocation in a data-file +Bugzilla: 1807057 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +Test what happens when writing data to an external data file, where the +write requires an L2 entry to be allocated, but the data write fails. + +Signed-off-by: Max Reitz +Message-Id: <20200225143130.111267-4-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 81311255f217859413c94f2cd9cebf2684bbda94) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/026 | 32 ++++++++++++++++++++++++++++++ + tests/qemu-iotests/026.out | 6 ++++++ + tests/qemu-iotests/026.out.nocache | 6 ++++++ + 3 files changed, 44 insertions(+) + +diff --git a/tests/qemu-iotests/026 b/tests/qemu-iotests/026 +index d89729697f..c1c96a41d9 100755 +--- a/tests/qemu-iotests/026 ++++ b/tests/qemu-iotests/026 +@@ -30,6 +30,7 @@ _cleanup() + { + _cleanup_test_img + rm "$TEST_DIR/blkdebug.conf" ++ rm -f "$TEST_IMG.data_file" + } + trap "_cleanup; exit \$status" 0 1 2 3 15 + +@@ -236,6 +237,37 @@ $QEMU_IO -c "write 0 $CLUSTER_SIZE" "$BLKDBG_TEST_IMG" | _filter_qemu_io + + _check_test_img + ++echo ++echo === Avoid freeing external data clusters on failure === ++echo ++ ++# Similar test as the last one, except we test what happens when there ++# is an error when writing to an external data file instead of when ++# writing to a preallocated zero cluster ++_make_test_img -o "data_file=$TEST_IMG.data_file" $CLUSTER_SIZE ++ ++# Put blkdebug above the data-file, and a raw node on top of that so ++# that blkdebug will see a write_aio event and emit an error ++$QEMU_IO -c "write 0 $CLUSTER_SIZE" \ ++ "json:{ ++ 'driver': 'qcow2', ++ 'file': { 'driver': 'file', 'filename': '$TEST_IMG' }, ++ 'data-file': { ++ 'driver': 'raw', ++ 'file': { ++ 'driver': 'blkdebug', ++ 'config': '$TEST_DIR/blkdebug.conf', ++ 'image': { ++ 'driver': 'file', ++ 'filename': '$TEST_IMG.data_file' ++ } ++ } ++ } ++ }" \ ++ | _filter_qemu_io ++ ++_check_test_img ++ + # success, all done + echo "*** done" + rm -f $seq.full +diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out +index 83989996ff..c1b3b58482 100644 +--- a/tests/qemu-iotests/026.out ++++ b/tests/qemu-iotests/026.out +@@ -653,4 +653,10 @@ wrote 1024/1024 bytes at offset 0 + 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + write failed: Input/output error + No errors were found on the image. 
++ ++=== Avoid freeing external data clusters on failure === ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1024 data_file=TEST_DIR/t.IMGFMT.data_file ++write failed: Input/output error ++No errors were found on the image. + *** done +diff --git a/tests/qemu-iotests/026.out.nocache b/tests/qemu-iotests/026.out.nocache +index 9359d26d7e..8d5001648a 100644 +--- a/tests/qemu-iotests/026.out.nocache ++++ b/tests/qemu-iotests/026.out.nocache +@@ -661,4 +661,10 @@ wrote 1024/1024 bytes at offset 0 + 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + write failed: Input/output error + No errors were found on the image. ++ ++=== Avoid freeing external data clusters on failure === ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1024 data_file=TEST_DIR/t.IMGFMT.data_file ++write failed: Input/output error ++No errors were found on the image. + *** done +-- +2.27.0 + diff --git a/kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch b/kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch new file mode 100755 index 0000000..36d609c --- /dev/null +++ b/kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch @@ -0,0 +1,102 @@ +From b1035096f2d46e2146704d1db9581c6d2131d1f4 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Mon, 13 Jul 2020 14:24:49 -0400 +Subject: [PATCH 2/4] iotests/026: Test EIO on preallocated zero cluster + +RH-Author: Max Reitz +Message-id: <20200713142451.289703-3-mreitz@redhat.com> +Patchwork-id: 97953 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/4] iotests/026: Test EIO on preallocated zero cluster +Bugzilla: 1807057 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +Test what happens when writing data to a preallocated zero cluster, but +the data write fails. + +Signed-off-by: Max Reitz +Message-Id: <20200225143130.111267-3-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 31ab00f3747c00fdbb9027cea644b40dd1405480) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. 
L. de Paula +--- + tests/qemu-iotests/026 | 21 +++++++++++++++++++++ + tests/qemu-iotests/026.out | 10 ++++++++++ + tests/qemu-iotests/026.out.nocache | 10 ++++++++++ + 3 files changed, 41 insertions(+) + +diff --git a/tests/qemu-iotests/026 b/tests/qemu-iotests/026 +index 3430029ed6..d89729697f 100755 +--- a/tests/qemu-iotests/026 ++++ b/tests/qemu-iotests/026 +@@ -215,6 +215,27 @@ _make_test_img 64M + $QEMU_IO -c "write 0 1M" -c "write 0 1M" "$BLKDBG_TEST_IMG" | _filter_qemu_io + _check_test_img + ++echo ++echo === Avoid freeing preallocated zero clusters on failure === ++echo ++ ++cat > "$TEST_DIR/blkdebug.conf" < +Date: Wed, 3 Jun 2020 16:03:17 +0100 +Subject: [PATCH 18/26] iotests/055: refactor compressed backup to vmdk + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-4-kwolf@redhat.com> +Patchwork-id: 97104 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 03/11] iotests/055: refactor compressed backup to vmdk +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +From: Vladimir Sementsov-Ogievskiy + +Instead of looping in each test, let's better refactor vmdk target case +as a subclass. + +Signed-off-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200430124713.3067-6-vsementsov@virtuozzo.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 8e8372944e5e097e98844b4db10f867689065e16) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/055 | 70 ++++++++++++++++++++++++---------------------- + tests/qemu-iotests/055.out | 4 +-- + 2 files changed, 39 insertions(+), 35 deletions(-) + +diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055 +index eb50c9f..8666601 100755 +--- a/tests/qemu-iotests/055 ++++ b/tests/qemu-iotests/055 +@@ -450,10 +450,9 @@ class TestSingleTransaction(iotests.QMPTestCase): + self.assert_no_active_block_jobs() + + +-class TestDriveCompression(iotests.QMPTestCase): ++class TestCompressedToQcow2(iotests.QMPTestCase): + image_len = 64 * 1024 * 1024 # MB +- fmt_supports_compression = [{'type': 'qcow2', 'args': ()}, +- {'type': 'vmdk', 'args': ('-o', 'subformat=streamOptimized')}] ++ target_fmt = {'type': 'qcow2', 'args': ()} + + def tearDown(self): + self.vm.shutdown() +@@ -463,19 +462,20 @@ class TestDriveCompression(iotests.QMPTestCase): + except OSError: + pass + +- def do_prepare_drives(self, fmt, args, attach_target): ++ def do_prepare_drives(self, attach_target): + self.vm = iotests.VM().add_drive('blkdebug::' + test_img) + +- qemu_img('create', '-f', fmt, blockdev_target_img, +- str(TestDriveCompression.image_len), *args) ++ qemu_img('create', '-f', self.target_fmt['type'], blockdev_target_img, ++ str(self.image_len), *self.target_fmt['args']) + if attach_target: + self.vm.add_drive(blockdev_target_img, +- img_format=fmt, interface="none") ++ img_format=self.target_fmt['type'], ++ interface="none") + + self.vm.launch() + +- def do_test_compress_complete(self, cmd, format, attach_target, **args): +- self.do_prepare_drives(format['type'], format['args'], attach_target) ++ def do_test_compress_complete(self, cmd, attach_target, **args): ++ self.do_prepare_drives(attach_target) + + self.assert_no_active_block_jobs() + +@@ -486,21 +486,21 @@ class TestDriveCompression(iotests.QMPTestCase): + + self.vm.shutdown() + self.assertTrue(iotests.compare_images(test_img, blockdev_target_img, +- iotests.imgfmt, format['type']), ++ iotests.imgfmt, 
++ self.target_fmt['type']), + 'target image does not match source after backup') + + def test_complete_compress_drive_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_complete('drive-backup', format, False, +- target=blockdev_target_img, mode='existing') ++ self.do_test_compress_complete('drive-backup', False, ++ target=blockdev_target_img, ++ mode='existing') + + def test_complete_compress_blockdev_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_complete('blockdev-backup', format, True, +- target='drive1') ++ self.do_test_compress_complete('blockdev-backup', ++ True, target='drive1') + +- def do_test_compress_cancel(self, cmd, format, attach_target, **args): +- self.do_prepare_drives(format['type'], format['args'], attach_target) ++ def do_test_compress_cancel(self, cmd, attach_target, **args): ++ self.do_prepare_drives(attach_target) + + self.assert_no_active_block_jobs() + +@@ -514,17 +514,16 @@ class TestDriveCompression(iotests.QMPTestCase): + self.vm.shutdown() + + def test_compress_cancel_drive_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_cancel('drive-backup', format, False, +- target=blockdev_target_img, mode='existing') ++ self.do_test_compress_cancel('drive-backup', False, ++ target=blockdev_target_img, ++ mode='existing') + + def test_compress_cancel_blockdev_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_cancel('blockdev-backup', format, True, +- target='drive1') ++ self.do_test_compress_cancel('blockdev-backup', True, ++ target='drive1') + +- def do_test_compress_pause(self, cmd, format, attach_target, **args): +- self.do_prepare_drives(format['type'], format['args'], attach_target) ++ def do_test_compress_pause(self, cmd, attach_target, **args): ++ self.do_prepare_drives(attach_target) + + self.assert_no_active_block_jobs() + +@@ 
-550,18 +549,23 @@ class TestDriveCompression(iotests.QMPTestCase): + + self.vm.shutdown() + self.assertTrue(iotests.compare_images(test_img, blockdev_target_img, +- iotests.imgfmt, format['type']), ++ iotests.imgfmt, ++ self.target_fmt['type']), + 'target image does not match source after backup') + + def test_compress_pause_drive_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_pause('drive-backup', format, False, +- target=blockdev_target_img, mode='existing') ++ self.do_test_compress_pause('drive-backup', False, ++ target=blockdev_target_img, ++ mode='existing') + + def test_compress_pause_blockdev_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_pause('blockdev-backup', format, True, +- target='drive1') ++ self.do_test_compress_pause('blockdev-backup', True, ++ target='drive1') ++ ++ ++class TestCompressedToVmdk(TestCompressedToQcow2): ++ target_fmt = {'type': 'vmdk', 'args': ('-o', 'subformat=streamOptimized')} ++ + + if __name__ == '__main__': + iotests.main(supported_fmts=['raw', 'qcow2'], +diff --git a/tests/qemu-iotests/055.out b/tests/qemu-iotests/055.out +index 5ce2f9a..5c26d15 100644 +--- a/tests/qemu-iotests/055.out ++++ b/tests/qemu-iotests/055.out +@@ -1,5 +1,5 @@ +-.............................. ++.................................... 
+ ---------------------------------------------------------------------- +-Ran 30 tests ++Ran 36 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-055-skip-vmdk-target-tests-if-vmdk-is-not-wh.patch b/kvm-iotests-055-skip-vmdk-target-tests-if-vmdk-is-not-wh.patch new file mode 100755 index 0000000..260d511 --- /dev/null +++ b/kvm-iotests-055-skip-vmdk-target-tests-if-vmdk-is-not-wh.patch @@ -0,0 +1,45 @@ +From 9a0ca4797cbd029dab9209d88f8c81b78ded8fd0 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:18 +0100 +Subject: [PATCH 19/26] iotests/055: skip vmdk target tests if vmdk is not + whitelisted + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-5-kwolf@redhat.com> +Patchwork-id: 97101 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 04/11] iotests/055: skip vmdk target tests if vmdk is not whitelisted +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +From: Vladimir Sementsov-Ogievskiy + +Signed-off-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200430124713.3067-7-vsementsov@virtuozzo.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 761cd2e791eae38c3d08ea5f83309ce58bb85ff7) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/055 | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055 +index 8666601..c9cdc06 100755 +--- a/tests/qemu-iotests/055 ++++ b/tests/qemu-iotests/055 +@@ -566,6 +566,10 @@ class TestCompressedToQcow2(iotests.QMPTestCase): + class TestCompressedToVmdk(TestCompressedToQcow2): + target_fmt = {'type': 'vmdk', 'args': ('-o', 'subformat=streamOptimized')} + ++ @iotests.skip_if_unsupported(['vmdk']) ++ def setUp(self): ++ pass ++ + + if __name__ == '__main__': + iotests.main(supported_fmts=['raw', 'qcow2'], +-- +1.8.3.1 + diff --git a/kvm-iotests-109-Don-t-mirror-with-mismatched-size.patch b/kvm-iotests-109-Don-t-mirror-with-mismatched-size.patch new file mode 100755 index 0000000..c71bcba --- /dev/null +++ b/kvm-iotests-109-Don-t-mirror-with-mismatched-size.patch @@ -0,0 +1,387 @@ +From 2202321b549dda551190d919a5a1cbee0fab8c90 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:22 +0100 +Subject: [PATCH 23/26] iotests/109: Don't mirror with mismatched size + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-9-kwolf@redhat.com> +Patchwork-id: 97105 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 08/11] iotests/109: Don't mirror with mismatched size +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +This patch makes the raw image the same size as the file in a different +format that is mirrored as raw to it to avoid errors when mirror starts +to enforce that source and target are the same size. + +We check only that the first 512 bytes are zeroed (instead of 64k) +because some image formats create image files that are smaller than 64k, +so trying to read 64k would result in I/O errors. Apart from this, 512 +is more appropriate anyway because the raw format driver protects +specifically the first 512 bytes. 
+ +Signed-off-by: Kevin Wolf +Message-Id: <20200511135825.219437-2-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit ffa41a62d0b0e6d91f2071328befa046d56993e1) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/109 | 10 +++--- + tests/qemu-iotests/109.out | 74 +++++++++++++++++----------------------- + tests/qemu-iotests/common.filter | 5 +++ + 3 files changed, 41 insertions(+), 48 deletions(-) + +diff --git a/tests/qemu-iotests/109 b/tests/qemu-iotests/109 +index 9897ceb..190c35e 100755 +--- a/tests/qemu-iotests/109 ++++ b/tests/qemu-iotests/109 +@@ -76,14 +76,14 @@ for fmt in qcow qcow2 qed vdi vmdk vpc; do + echo "=== Writing a $fmt header into raw ===" + echo + +- _make_test_img 64M + TEST_IMG="$TEST_IMG.src" IMGFMT=$fmt _make_test_img 64M ++ _make_test_img $(du -b "$TEST_IMG.src" | cut -f1) | _filter_img_create_size + + # This first test should fail: The image format was probed, we may not + # write an image header at the start of the image + run_qemu "$TEST_IMG" "$TEST_IMG.src" "" "BLOCK_JOB_ERROR" | + _filter_block_job_len +- $QEMU_IO -c 'read -P 0 0 64k' "$TEST_IMG" | _filter_qemu_io ++ $QEMU_IO -c 'read -P 0 0 512' "$TEST_IMG" | _filter_qemu_io + + + # When raw was explicitly specified, the same must succeed +@@ -102,12 +102,12 @@ for sample_img in empty.bochs iotest-dirtylog-10G-4M.vhdx parallels-v1 \ + + # Can't use _use_sample_img because that isn't designed to be used multiple + # times and it overwrites $TEST_IMG (both breaks cleanup) +- _make_test_img 64M + bzcat "$SAMPLE_IMG_DIR/$sample_img.bz2" > "$TEST_IMG.src" ++ _make_test_img $(du -b "$TEST_IMG.src" | cut -f1) | _filter_img_create_size + + run_qemu "$TEST_IMG" "$TEST_IMG.src" "" "BLOCK_JOB_ERROR" | + _filter_block_job_offset | _filter_block_job_len +- $QEMU_IO -c 'read -P 0 0 64k' "$TEST_IMG" | _filter_qemu_io ++ $QEMU_IO -c 'read -P 0 0 512' "$TEST_IMG" | 
_filter_qemu_io + + run_qemu "$TEST_IMG" "$TEST_IMG.src" "'format': 'raw'," "BLOCK_JOB_READY" + $QEMU_IMG compare -f raw -F raw "$TEST_IMG" "$TEST_IMG.src" +@@ -118,8 +118,8 @@ echo "=== Write legitimate MBR into raw ===" + echo + + for sample_img in grub_mbr.raw; do +- _make_test_img 64M + bzcat "$SAMPLE_IMG_DIR/$sample_img.bz2" > "$TEST_IMG.src" ++ _make_test_img $(du -b "$TEST_IMG.src" | cut -f1) | _filter_img_create_size + + run_qemu "$TEST_IMG" "$TEST_IMG.src" "" "BLOCK_JOB_READY" + $QEMU_IMG compare -f raw -F raw "$TEST_IMG" "$TEST_IMG.src" +diff --git a/tests/qemu-iotests/109.out b/tests/qemu-iotests/109.out +index 884f65f..ad739df 100644 +--- a/tests/qemu-iotests/109.out ++++ b/tests/qemu-iotests/109.out +@@ -2,8 +2,8 @@ QA output created by 109 + + === Writing a qcow header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -23,8 +23,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -43,13 +43,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": 
{"device": "src", "len": 1024, "offset": 1024, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Writing a qcow2 header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -69,8 +68,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -89,13 +88,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 197120, "offset": 197120, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", 
"id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Writing a qed header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -115,8 +113,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -135,13 +133,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 327680, "offset": 327680, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. 
+ + === Writing a vdi header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -161,8 +158,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -181,13 +178,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 1024, "offset": 1024, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. 
+ + === Writing a vmdk header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -207,8 +203,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -227,13 +223,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. 
+ + === Writing a vpc header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -253,8 +248,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -273,12 +268,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 2560, "offset": 2560, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. 
+ + === Copying sample image empty.bochs into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -298,8 +292,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -318,12 +312,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 2560, "offset": 2560, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. 
+ + === Copying sample image iotest-dirtylog-10G-4M.vhdx into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -343,8 +336,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -363,12 +356,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 31457280, "offset": 31457280, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. 
+ + === Copying sample image parallels-v1 into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -388,8 +380,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -408,12 +400,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 327680, "offset": 327680, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. 
+ + === Copying sample image simple-pattern.cloop into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -433,8 +424,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -453,12 +444,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 2048, "offset": 2048, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. 
+ + === Write legitimate MBR into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -480,7 +470,6 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 512, "offset": 512, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + { 'execute': 'qmp_capabilities' } + {"return": {}} +@@ -500,6 +489,5 @@ Images are identical. + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 512, "offset": 512, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. 
+ *** done +diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter +index 5367dee..c8e8663 100644 +--- a/tests/qemu-iotests/common.filter ++++ b/tests/qemu-iotests/common.filter +@@ -149,6 +149,11 @@ _filter_img_create() + -e "s# force_size=\\(on\\|off\\)##g" + } + ++_filter_img_create_size() ++{ ++ $SED -e "s# size=[0-9]\\+# size=SIZE#g" ++} ++ + _filter_img_info() + { + if [[ "$1" == "--format-specific" ]]; then +-- +1.8.3.1 + diff --git a/kvm-iotests-229-Use-blkdebug-to-inject-an-error.patch b/kvm-iotests-229-Use-blkdebug-to-inject-an-error.patch new file mode 100755 index 0000000..ef8807c --- /dev/null +++ b/kvm-iotests-229-Use-blkdebug-to-inject-an-error.patch @@ -0,0 +1,120 @@ +From 104c8f6210bf573cf39c2a14cdb0b081baaaa3f0 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:23 +0100 +Subject: [PATCH 24/26] iotests/229: Use blkdebug to inject an error + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-10-kwolf@redhat.com> +Patchwork-id: 97108 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 09/11] iotests/229: Use blkdebug to inject an error +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +229 relies on the mirror running into an I/O error when the target is +smaller than the source. After changing mirror to catch this condition +while starting the job, this test case won't get a job that is paused +for an I/O error any more. Use blkdebug instead to inject an error. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Message-Id: <20200511135825.219437-3-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit d89ac3cf305b28c024a76805a84d75c0ee1e786f) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/229 | 18 +++++++++++++----- + tests/qemu-iotests/229.out | 6 +++--- + 2 files changed, 16 insertions(+), 8 deletions(-) + +diff --git a/tests/qemu-iotests/229 b/tests/qemu-iotests/229 +index e18a464..511fbc0 100755 +--- a/tests/qemu-iotests/229 ++++ b/tests/qemu-iotests/229 +@@ -32,6 +32,7 @@ _cleanup() + _cleanup_qemu + _cleanup_test_img + rm -f "$TEST_IMG" "$DEST_IMG" ++ rm -f "$TEST_DIR/blkdebug.conf" + } + trap "_cleanup; exit \$status" 0 1 2 3 15 + +@@ -48,11 +49,10 @@ _supported_os Linux + + DEST_IMG="$TEST_DIR/d.$IMGFMT" + TEST_IMG="$TEST_DIR/b.$IMGFMT" ++BLKDEBUG_CONF="$TEST_DIR/blkdebug.conf" + + _make_test_img 2M +- +-# destination for mirror will be too small, causing error +-TEST_IMG=$DEST_IMG _make_test_img 1M ++TEST_IMG=$DEST_IMG _make_test_img 2M + + $QEMU_IO -c 'write 0 2M' "$TEST_IMG" | _filter_qemu_io + +@@ -66,11 +66,18 @@ echo + echo '=== Starting drive-mirror, causing error & stop ===' + echo + ++cat > "$BLKDEBUG_CONF" < +Date: Fri, 13 Mar 2020 12:34:38 +0000 +Subject: [PATCH 18/20] iotests: Add iothread cases to 155 + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-13-kwolf@redhat.com> +Patchwork-id: 94289 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 12/13] iotests: Add iothread cases to 155 +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +This patch adds test cases for attaching the backing chain to a mirror +job target right before finalising the job, where the image is in a +non-mainloop AioContext (i.e. the backing chain needs to be moved to the +AioContext of the mirror target). + +This requires switching the test case from virtio-blk to virtio-scsi +because virtio-blk only actually starts using the iothreads when the +guest driver initialises the device (which never happens in a test case +without a guest OS). virtio-scsi always keeps its block nodes in the +AioContext of the requested iothread without guest interaction. 
+ +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-7-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit 6a5f6403a11307794ec79d277a065c137cfc12b2) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/155 | 32 +++++++++++++++++++++++--------- + tests/qemu-iotests/155.out | 4 ++-- + 2 files changed, 25 insertions(+), 11 deletions(-) + +diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155 +index 3053e50..b552d1f 100755 +--- a/tests/qemu-iotests/155 ++++ b/tests/qemu-iotests/155 +@@ -49,11 +49,14 @@ target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt) + # chain opened right away. If False, blockdev-add + # opens it without a backing file and job completion + # is supposed to open the backing chain. ++# use_iothread: If True, an iothread is configured for the virtio-blk device ++# that uses the image being mirrored + + class BaseClass(iotests.QMPTestCase): + target_blockdev_backing = None + target_real_backing = None + target_open_with_backing = True ++ use_iothread = False + + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, back0_img, '1440K') +@@ -69,7 +72,16 @@ class BaseClass(iotests.QMPTestCase): + 'file': {'driver': 'file', + 'filename': source_img}} + self.vm.add_blockdev(self.vm.qmp_to_opts(blockdev)) +- self.vm.add_device('virtio-blk,id=qdev0,drive=source') ++ ++ if self.use_iothread: ++ self.vm.add_object('iothread,id=iothread0') ++ iothread = ",iothread=iothread0" ++ else: ++ iothread = "" ++ ++ self.vm.add_device('virtio-scsi%s' % iothread) ++ self.vm.add_device('scsi-hd,id=qdev0,drive=source') ++ + self.vm.launch() + + self.assertIntactSourceBackingChain() +@@ -182,24 +194,21 @@ class MirrorBaseClass(BaseClass): + def testFull(self): + self.runMirror('full') + +- node = self.findBlockNode('target', +- '/machine/peripheral/qdev0/virtio-backend') ++ node = self.findBlockNode('target', 'qdev0') + 
self.assertCorrectBackingImage(node, None) + self.assertIntactSourceBackingChain() + + def testTop(self): + self.runMirror('top') + +- node = self.findBlockNode('target', +- '/machine/peripheral/qdev0/virtio-backend') ++ node = self.findBlockNode('target', 'qdev0') + self.assertCorrectBackingImage(node, back2_img) + self.assertIntactSourceBackingChain() + + def testNone(self): + self.runMirror('none') + +- node = self.findBlockNode('target', +- '/machine/peripheral/qdev0/virtio-backend') ++ node = self.findBlockNode('target', 'qdev0') + self.assertCorrectBackingImage(node, source_img) + self.assertIntactSourceBackingChain() + +@@ -252,6 +261,9 @@ class TestBlockdevMirrorReopen(MirrorBaseClass): + backing="backing") + self.assert_qmp(result, 'return', {}) + ++class TestBlockdevMirrorReopenIothread(TestBlockdevMirrorReopen): ++ use_iothread = True ++ + # Attach the backing chain only during completion, with blockdev-snapshot + class TestBlockdevMirrorSnapshot(MirrorBaseClass): + cmd = 'blockdev-mirror' +@@ -268,6 +280,9 @@ class TestBlockdevMirrorSnapshot(MirrorBaseClass): + overlay="target") + self.assert_qmp(result, 'return', {}) + ++class TestBlockdevMirrorSnapshotIothread(TestBlockdevMirrorSnapshot): ++ use_iothread = True ++ + class TestCommit(BaseClass): + existing = False + +@@ -283,8 +298,7 @@ class TestCommit(BaseClass): + + self.vm.event_wait('BLOCK_JOB_COMPLETED') + +- node = self.findBlockNode(None, +- '/machine/peripheral/qdev0/virtio-backend') ++ node = self.findBlockNode(None, 'qdev0') + self.assert_qmp(node, 'image' + '/backing-image' * 0 + '/filename', + back1_img) + self.assert_qmp(node, 'image' + '/backing-image' * 1 + '/filename', +diff --git a/tests/qemu-iotests/155.out b/tests/qemu-iotests/155.out +index 4fd1c2d..ed714d5 100644 +--- a/tests/qemu-iotests/155.out ++++ b/tests/qemu-iotests/155.out +@@ -1,5 +1,5 @@ +-......................... ++............................... 
+ ---------------------------------------------------------------------- +-Ran 25 tests ++Ran 31 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-Add-more-skip_if_unsupported-statements-to-t.patch b/kvm-iotests-Add-more-skip_if_unsupported-statements-to-t.patch new file mode 100755 index 0000000..6bdf130 --- /dev/null +++ b/kvm-iotests-Add-more-skip_if_unsupported-statements-to-t.patch @@ -0,0 +1,236 @@ +From adda561394bb07c13ef3f2712b36704790530891 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:15 +0100 +Subject: [PATCH 16/26] iotests: Add more "skip_if_unsupported" statements to + the python tests + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-2-kwolf@redhat.com> +Patchwork-id: 97099 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 01/11] iotests: Add more "skip_if_unsupported" statements to the python tests +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +From: Thomas Huth + +The python code already contains a possibility to skip tests if the +corresponding driver is not available in the qemu binary - use it +in more spots to avoid that the tests are failing if the driver has +been disabled. + +While we're at it, we can now also remove some of the old checks that +were using iotests.supports_quorum() - and which were apparently not +working as expected since the tests aborted instead of being skipped +when "quorum" was missing in the QEMU binary. + +Signed-off-by: Thomas Huth +Signed-off-by: Kevin Wolf +(cherry picked from commit 9442bebe6e67a5d038bbf2572b79e7b59d202a23) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/030 | 4 +--- + tests/qemu-iotests/040 | 2 ++ + tests/qemu-iotests/041 | 39 +++------------------------------------ + tests/qemu-iotests/245 | 2 ++ + 4 files changed, 8 insertions(+), 39 deletions(-) + +diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030 +index f3766f2..bddbb30 100755 +--- a/tests/qemu-iotests/030 ++++ b/tests/qemu-iotests/030 +@@ -530,6 +530,7 @@ class TestQuorum(iotests.QMPTestCase): + children = [] + backing = [] + ++ @iotests.skip_if_unsupported(['quorum']) + def setUp(self): + opts = ['driver=quorum', 'vote-threshold=2'] + +@@ -560,9 +561,6 @@ class TestQuorum(iotests.QMPTestCase): + os.remove(img) + + def test_stream_quorum(self): +- if not iotests.supports_quorum(): +- return +- + self.assertNotEqual(qemu_io('-f', iotests.imgfmt, '-rU', '-c', 'map', self.children[0]), + qemu_io('-f', iotests.imgfmt, '-rU', '-c', 'map', self.backing[0]), + 'image file map matches backing file before streaming') +diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040 +index 762ad1e..74f62c3 100755 +--- a/tests/qemu-iotests/040 ++++ b/tests/qemu-iotests/040 +@@ -106,6 +106,7 @@ class TestSingleDrive(ImageCommitTestCase): + self.assertEqual(-1, qemu_io('-f', 'raw', '-c', 'read -P 0xab 0 524288', backing_img).find("verification failed")) + self.assertEqual(-1, qemu_io('-f', 'raw', '-c', 'read -P 0xef 524288 524288', backing_img).find("verification failed")) + ++ @iotests.skip_if_unsupported(['throttle']) + def test_commit_with_filter_and_quit(self): + result = self.vm.qmp('object-add', qom_type='throttle-group', id='tg') + self.assert_qmp(result, 'return', {}) +@@ -125,6 +126,7 @@ class TestSingleDrive(ImageCommitTestCase): + self.has_quit = True + + # Same as above, but this time we add the filter after starting the job ++ @iotests.skip_if_unsupported(['throttle']) + def test_commit_plus_filter_and_quit(self): + result = self.vm.qmp('object-add', qom_type='throttle-group', id='tg') + self.assert_qmp(result, 
'return', {}) +diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 +index 8568426..a543b15 100755 +--- a/tests/qemu-iotests/041 ++++ b/tests/qemu-iotests/041 +@@ -871,6 +871,7 @@ class TestRepairQuorum(iotests.QMPTestCase): + image_len = 1 * 1024 * 1024 # MB + IMAGES = [ quorum_img1, quorum_img2, quorum_img3 ] + ++ @iotests.skip_if_unsupported(['quorum']) + def setUp(self): + self.vm = iotests.VM() + +@@ -891,9 +892,8 @@ class TestRepairQuorum(iotests.QMPTestCase): + #assemble the quorum block device from the individual files + args = { "driver": "quorum", "node-name": "quorum0", + "vote-threshold": 2, "children": [ "img0", "img1", "img2" ] } +- if iotests.supports_quorum(): +- result = self.vm.qmp("blockdev-add", **args) +- self.assert_qmp(result, 'return', {}) ++ result = self.vm.qmp("blockdev-add", **args) ++ self.assert_qmp(result, 'return', {}) + + + def tearDown(self): +@@ -906,9 +906,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + pass + + def test_complete(self): +- if not iotests.supports_quorum(): +- return +- + self.assert_no_active_block_jobs() + + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', +@@ -925,9 +922,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + 'target image does not match source after mirroring') + + def test_cancel(self): +- if not iotests.supports_quorum(): +- return +- + self.assert_no_active_block_jobs() + + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', +@@ -942,9 +936,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.vm.shutdown() + + def test_cancel_after_ready(self): +- if not iotests.supports_quorum(): +- return +- + self.assert_no_active_block_jobs() + + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', +@@ -961,9 +952,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + 'target image does not match source after mirroring') + + def test_pause(self): +- if not iotests.supports_quorum(): +- return +- + self.assert_no_active_block_jobs() + + 
result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', +@@ -989,9 +977,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + 'target image does not match source after mirroring') + + def test_medium_not_found(self): +- if not iotests.supports_quorum(): +- return +- + if iotests.qemu_default_machine != 'pc': + return + +@@ -1003,9 +988,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_image_not_found(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', + sync='full', node_name='repair0', replaces='img1', + mode='existing', target=quorum_repair_img, +@@ -1013,9 +995,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_device_not_found(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', job_id='job0', + device='nonexistent', sync='full', + node_name='repair0', +@@ -1024,9 +1003,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_wrong_sync_mode(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', device='quorum0', job_id='job0', + node_name='repair0', + replaces='img1', +@@ -1034,27 +1010,18 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_no_node_name(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', + sync='full', replaces='img1', + target=quorum_repair_img, format=iotests.imgfmt) + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_nonexistent_replaces(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', + sync='full', node_name='repair0', 
replaces='img77', + target=quorum_repair_img, format=iotests.imgfmt) + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_after_a_quorum_snapshot(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('blockdev-snapshot-sync', node_name='img1', + snapshot_file=quorum_snapshot_file, + snapshot_node_name="snap1"); +diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 +index 919131d..ed972f9 100644 +--- a/tests/qemu-iotests/245 ++++ b/tests/qemu-iotests/245 +@@ -478,6 +478,7 @@ class TestBlockdevReopen(iotests.QMPTestCase): + # This test verifies that we can't change the children of a block + # device during a reopen operation in a way that would create + # cycles in the node graph ++ @iotests.skip_if_unsupported(['blkverify']) + def test_graph_cycles(self): + opts = [] + +@@ -534,6 +535,7 @@ class TestBlockdevReopen(iotests.QMPTestCase): + self.assert_qmp(result, 'return', {}) + + # Misc reopen tests with different block drivers ++ @iotests.skip_if_unsupported(['quorum', 'throttle']) + def test_misc_drivers(self): + #################### + ###### quorum ###### +-- +1.8.3.1 + diff --git a/kvm-iotests-Add-qemu_io_log.patch b/kvm-iotests-Add-qemu_io_log.patch new file mode 100755 index 0000000..a65bc5a --- /dev/null +++ b/kvm-iotests-Add-qemu_io_log.patch @@ -0,0 +1,48 @@ +From 2be333e847c01397bb6a92b2e4c60e904957675d Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:37 +0100 +Subject: [PATCH 09/17] iotests: Add qemu_io_log() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-9-kwolf@redhat.com> +Patchwork-id: 97451 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 08/11] iotests: Add qemu_io_log() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +Add a function that runs qemu-io and logs the output with the +appropriate filters applied. 
+ +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit a96f0350e3d95c98f2bff1863d14493af5c1d360) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/iotests.py | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index be20d56..7a9c779 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -162,6 +162,11 @@ def qemu_io(*args): + sys.stderr.write('qemu-io received signal %i: %s\n' % (-exitcode, ' '.join(args))) + return subp.communicate()[0] + ++def qemu_io_log(*args): ++ result = qemu_io(*args) ++ log(result, filters=[filter_testfiles, filter_qemu_io]) ++ return result ++ + def qemu_io_silent(*args): + '''Run qemu-io and return the exit code, suppressing stdout''' + args = qemu_io_args + list(args) +-- +1.8.3.1 + diff --git a/kvm-iotests-Add-test-291-to-for-qemu-img-bitmap-coverage.patch b/kvm-iotests-Add-test-291-to-for-qemu-img-bitmap-coverage.patch new file mode 100755 index 0000000..6144043 --- /dev/null +++ b/kvm-iotests-Add-test-291-to-for-qemu-img-bitmap-coverage.patch @@ -0,0 +1,253 @@ +From eccae2f252513d2965ef919022c3ed068da275bd Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:20 +0100 +Subject: [PATCH 15/26] iotests: Add test 291 to for qemu-img bitmap coverage + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-13-eblake@redhat.com> +Patchwork-id: 97079 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 12/12] iotests: Add test 291 to for qemu-img bitmap coverage +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Add a new test covering the 'qemu-img bitmap' subcommand, as well as +'qemu-img convert --bitmaps', both added in recent patches. 
+ +Signed-off-by: Eric Blake +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200521192137.1120211-6-eblake@redhat.com> +(cherry picked from commit cf2d1203dcfc2bf964453d83a2302231ce77f2dc) + +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + tests/qemu-iotests/group - context: other tests not backported + tests/qemu-iotests/291.out - zstd compression not backported +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/291 | 112 +++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/291.out | 78 +++++++++++++++++++++++++++++++ + tests/qemu-iotests/group | 1 + + 3 files changed, 191 insertions(+) + create mode 100755 tests/qemu-iotests/291 + create mode 100644 tests/qemu-iotests/291.out + +diff --git a/tests/qemu-iotests/291 b/tests/qemu-iotests/291 +new file mode 100755 +index 0000000..3ca83b9 +--- /dev/null ++++ b/tests/qemu-iotests/291 +@@ -0,0 +1,112 @@ ++#!/usr/bin/env bash ++# ++# Test qemu-img bitmap handling ++# ++# Copyright (C) 2018-2020 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++seq="$(basename $0)" ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! ++ ++_cleanup() ++{ ++ _cleanup_test_img ++ nbd_server_stop ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++. ./common.rc ++. ./common.filter ++. 
./common.nbd ++ ++_supported_fmt qcow2 ++_supported_proto file ++_supported_os Linux ++_require_command QEMU_NBD ++ ++echo ++echo "=== Initial image setup ===" ++echo ++ ++# Create backing image with one bitmap ++TEST_IMG="$TEST_IMG.base" _make_test_img 10M ++$QEMU_IMG bitmap --add -f $IMGFMT "$TEST_IMG.base" b0 ++$QEMU_IO -c 'w 3M 1M' -f $IMGFMT "$TEST_IMG.base" | _filter_qemu_io ++ ++# Create initial image and populate two bitmaps: one active, one inactive. ++ORIG_IMG=$TEST_IMG ++TEST_IMG=$TEST_IMG.orig ++_make_test_img -b "$ORIG_IMG.base" -F $IMGFMT 10M ++$QEMU_IO -c 'w 0 1M' -f $IMGFMT "$TEST_IMG" | _filter_qemu_io ++$QEMU_IMG bitmap --add -g 512k -f $IMGFMT "$TEST_IMG" b1 ++$QEMU_IMG bitmap --add --disable -f $IMGFMT "$TEST_IMG" b2 ++$QEMU_IO -c 'w 3M 1M' -f $IMGFMT "$TEST_IMG" | _filter_qemu_io ++$QEMU_IMG bitmap --clear -f $IMGFMT "$TEST_IMG" b1 ++$QEMU_IO -c 'w 1M 1M' -f $IMGFMT "$TEST_IMG" | _filter_qemu_io ++$QEMU_IMG bitmap --disable -f $IMGFMT "$TEST_IMG" b1 ++$QEMU_IMG bitmap --enable -f $IMGFMT "$TEST_IMG" b2 ++$QEMU_IO -c 'w 2M 1M' -f $IMGFMT "$TEST_IMG" | _filter_qemu_io ++ ++echo ++echo "=== Bitmap preservation not possible to non-qcow2 ===" ++echo ++ ++TEST_IMG=$ORIG_IMG ++$QEMU_IMG convert --bitmaps -O raw "$TEST_IMG.orig" "$TEST_IMG" && ++ echo "unexpected success" ++ ++echo ++echo "=== Convert with bitmap preservation ===" ++echo ++ ++# Only bitmaps from the active layer are copied ++$QEMU_IMG convert --bitmaps -O qcow2 "$TEST_IMG.orig" "$TEST_IMG" ++$QEMU_IMG info "$TEST_IMG" | _filter_img_info --format-specific ++# But we can also merge in bitmaps from other layers. 
This test is a bit ++# contrived to cover more code paths, in reality, you could merge directly ++# into b0 without going through tmp ++$QEMU_IMG bitmap --add --disable -f $IMGFMT "$TEST_IMG" b0 ++$QEMU_IMG bitmap --add --merge b0 -b "$TEST_IMG.base" -F $IMGFMT \ ++ -f $IMGFMT "$TEST_IMG" tmp ++$QEMU_IMG bitmap --merge tmp -f $IMGFMT "$TEST_IMG" b0 ++$QEMU_IMG bitmap --remove --image-opts \ ++ driver=$IMGFMT,file.driver=file,file.filename="$TEST_IMG" tmp ++$QEMU_IMG info "$TEST_IMG" | _filter_img_info --format-specific ++ ++echo ++echo "=== Check bitmap contents ===" ++echo ++ ++# x-dirty-bitmap is a hack for reading bitmaps; it abuses block status to ++# report "data":false for portions of the bitmap which are set ++IMG="driver=nbd,server.type=unix,server.path=$nbd_unix_socket" ++nbd_server_start_unix_socket -r -f qcow2 -B b0 "$TEST_IMG" ++$QEMU_IMG map --output=json --image-opts \ ++ "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b0" | _filter_qemu_img_map ++nbd_server_start_unix_socket -r -f qcow2 -B b1 "$TEST_IMG" ++$QEMU_IMG map --output=json --image-opts \ ++ "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b1" | _filter_qemu_img_map ++nbd_server_start_unix_socket -r -f qcow2 -B b2 "$TEST_IMG" ++$QEMU_IMG map --output=json --image-opts \ ++ "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b2" | _filter_qemu_img_map ++ ++# success, all done ++echo '*** done' ++rm -f $seq.full ++status=0 +diff --git a/tests/qemu-iotests/291.out b/tests/qemu-iotests/291.out +new file mode 100644 +index 0000000..14e5cfc +--- /dev/null ++++ b/tests/qemu-iotests/291.out +@@ -0,0 +1,78 @@ ++QA output created by 291 ++ ++=== Initial image setup === ++ ++Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=10485760 ++wrote 1048576/1048576 bytes at offset 3145728 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++Formatting 'TEST_DIR/t.IMGFMT.orig', fmt=IMGFMT size=10485760 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT ++wrote 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX 
YYY/sec and XXX ops/sec) ++wrote 1048576/1048576 bytes at offset 3145728 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++wrote 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++wrote 1048576/1048576 bytes at offset 2097152 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Bitmap preservation not possible to non-qcow2 === ++ ++qemu-img: Format driver 'raw' does not support bitmaps ++ ++=== Convert with bitmap preservation === ++ ++image: TEST_DIR/t.IMGFMT ++file format: IMGFMT ++virtual size: 10 MiB (10485760 bytes) ++disk size: 4.39 MiB ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ bitmaps: ++ [0]: ++ flags: ++ name: b1 ++ granularity: 524288 ++ [1]: ++ flags: ++ [0]: auto ++ name: b2 ++ granularity: 65536 ++ refcount bits: 16 ++ corrupt: false ++image: TEST_DIR/t.IMGFMT ++file format: IMGFMT ++virtual size: 10 MiB (10485760 bytes) ++disk size: 4.48 MiB ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ bitmaps: ++ [0]: ++ flags: ++ name: b1 ++ granularity: 524288 ++ [1]: ++ flags: ++ [0]: auto ++ name: b2 ++ granularity: 65536 ++ [2]: ++ flags: ++ name: b0 ++ granularity: 65536 ++ refcount bits: 16 ++ corrupt: false ++ ++=== Check bitmap contents === ++ ++[{ "start": 0, "length": 3145728, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 3145728, "length": 1048576, "depth": 0, "zero": false, "data": false}, ++{ "start": 4194304, "length": 6291456, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] ++[{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 1048576, "length": 1048576, "depth": 0, "zero": false, "data": false}, ++{ "start": 2097152, "length": 8388608, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] ++[{ "start": 0, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 2097152, "length": 1048576, 
"depth": 0, "zero": false, "data": false}, ++{ "start": 3145728, "length": 7340032, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] ++*** done +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 9c565cf..033b54d 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -290,3 +290,4 @@ + 280 rw migration quick + 281 rw quick + 284 rw ++291 rw quick +-- +1.8.3.1 + diff --git a/kvm-iotests-Add-test-for-image-creation-fallback.patch b/kvm-iotests-Add-test-for-image-creation-fallback.patch new file mode 100755 index 0000000..a8ea8f7 --- /dev/null +++ b/kvm-iotests-Add-test-for-image-creation-fallback.patch @@ -0,0 +1,138 @@ +From 55f3a02574da226299d99bd74d12dd91b0f228dc Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:46 +0000 +Subject: [PATCH 05/20] iotests: Add test for image creation fallback + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-6-mlevitsk@redhat.com> +Patchwork-id: 94228 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 5/6] iotests: Add test for image creation fallback +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-6-mreitz@redhat.com> +Reviewed-by: Eric Blake +Reviewed-by: Maxim Levitsky +[mreitz: Added a note that NBD does not support resizing, which is why + the second case is expected to fail] +Signed-off-by: Max Reitz +(cherry picked from commit 4dddeac115c5a2c5f74731fda0afd031a0b45490) +Signed-off-by: Maxim Levitsky + +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/259 | 62 ++++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/259.out | 14 +++++++++++ + tests/qemu-iotests/group | 1 + + 3 files changed, 77 insertions(+) + create mode 100755 tests/qemu-iotests/259 + create mode 100644 tests/qemu-iotests/259.out + +diff --git a/tests/qemu-iotests/259 b/tests/qemu-iotests/259 +new file mode 100755 +index 0000000..62e29af +--- /dev/null ++++ b/tests/qemu-iotests/259 +@@ -0,0 +1,62 @@ ++#!/usr/bin/env bash ++# ++# Test generic image creation fallback (by using NBD) ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++# creator ++owner=mreitz@redhat.com ++ ++seq=$(basename $0) ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! ++ ++_cleanup() ++{ ++ _cleanup_test_img ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++. ./common.rc ++. 
./common.filter ++ ++_supported_fmt raw ++_supported_proto nbd ++_supported_os Linux ++ ++ ++_make_test_img 64M ++ ++echo ++echo '--- Testing creation ---' ++ ++$QEMU_IMG create -f qcow2 "$TEST_IMG" 64M | _filter_img_create ++$QEMU_IMG info "$TEST_IMG" | _filter_img_info ++ ++echo ++echo '--- Testing creation for which the node would need to grow ---' ++ ++# NBD does not support resizing, so this will fail ++$QEMU_IMG create -f qcow2 -o preallocation=metadata "$TEST_IMG" 64M 2>&1 \ ++ | _filter_img_create ++ ++# success, all done ++echo "*** done" ++rm -f $seq.full ++status=0 +diff --git a/tests/qemu-iotests/259.out b/tests/qemu-iotests/259.out +new file mode 100644 +index 0000000..ffed19c +--- /dev/null ++++ b/tests/qemu-iotests/259.out +@@ -0,0 +1,14 @@ ++QA output created by 259 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++ ++--- Testing creation --- ++Formatting 'TEST_DIR/t.IMGFMT', fmt=qcow2 size=67108864 ++image: TEST_DIR/t.IMGFMT ++file format: qcow2 ++virtual size: 64 MiB (67108864 bytes) ++disk size: unavailable ++ ++--- Testing creation for which the node would need to grow --- ++qemu-img: TEST_DIR/t.IMGFMT: Could not resize image: Image format driver does not support resize ++Formatting 'TEST_DIR/t.IMGFMT', fmt=qcow2 size=67108864 preallocation=metadata ++*** done +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index c0e8197..e47cbfc 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -273,6 +273,7 @@ + 256 rw quick + 257 rw + 258 rw quick ++259 rw auto quick + 260 rw quick + 261 rw + 262 rw quick migration +-- +1.8.3.1 + diff --git a/kvm-iotests-Backup-with-different-source-target-size.patch b/kvm-iotests-Backup-with-different-source-target-size.patch new file mode 100755 index 0000000..4008413 --- /dev/null +++ b/kvm-iotests-Backup-with-different-source-target-size.patch @@ -0,0 +1,105 @@ +From 456c5e79c32e3f2f9319a7d1452fe523aded7835 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 
2020 16:03:21 +0100 +Subject: [PATCH 22/26] iotests: Backup with different source/target size + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-8-kwolf@redhat.com> +Patchwork-id: 97106 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 07/11] iotests: Backup with different source/target size +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +This tests that the backup job catches situations where the target node +has a different size than the source node. It must also forbid resize +operations when the job is already running. + +Signed-off-by: Kevin Wolf +Message-Id: <20200430142755.315494-5-kwolf@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit 0a82a9273062d05764e3df3637b3aa95ad8291c6) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/055 | 42 ++++++++++++++++++++++++++++++++++++++++-- + tests/qemu-iotests/055.out | 4 ++-- + 2 files changed, 42 insertions(+), 4 deletions(-) + +diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055 +index c9cdc06..1c70389 100755 +--- a/tests/qemu-iotests/055 ++++ b/tests/qemu-iotests/055 +@@ -48,8 +48,10 @@ class TestSingleDrive(iotests.QMPTestCase): + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, blockdev_target_img, str(image_len)) + +- self.vm = iotests.VM().add_drive('blkdebug::' + test_img) +- self.vm.add_drive(blockdev_target_img, interface="none") ++ self.vm = iotests.VM() ++ self.vm.add_drive('blkdebug::' + test_img, 'node-name=source') ++ self.vm.add_drive(blockdev_target_img, 'node-name=target', ++ interface="none") + if iotests.qemu_default_machine == 'pc': + self.vm.add_drive(None, 'media=cdrom', 'ide') + self.vm.launch() +@@ -112,6 +114,42 @@ class TestSingleDrive(iotests.QMPTestCase): + def test_pause_blockdev_backup(self): + self.do_test_pause('blockdev-backup', 'drive1', blockdev_target_img) + ++ def do_test_resize_blockdev_backup(self, 
device, node): ++ def pre_finalize(): ++ result = self.vm.qmp('block_resize', device=device, size=65536) ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ result = self.vm.qmp('block_resize', node_name=node, size=65536) ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ result = self.vm.qmp('blockdev-backup', job_id='job0', device='drive0', ++ target='drive1', sync='full', auto_finalize=False, ++ auto_dismiss=False) ++ self.assert_qmp(result, 'return', {}) ++ ++ self.vm.run_job('job0', auto_finalize=False, pre_finalize=pre_finalize, ++ use_log=False) ++ ++ def test_source_resize_blockdev_backup(self): ++ self.do_test_resize_blockdev_backup('drive0', 'source') ++ ++ def test_target_resize_blockdev_backup(self): ++ self.do_test_resize_blockdev_backup('drive1', 'target') ++ ++ def do_test_target_size(self, size): ++ result = self.vm.qmp('block_resize', device='drive1', size=size) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm.qmp('blockdev-backup', job_id='job0', device='drive0', ++ target='drive1', sync='full') ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ def test_small_target(self): ++ self.do_test_target_size(image_len // 2) ++ ++ def test_large_target(self): ++ self.do_test_target_size(image_len * 2) ++ + def test_medium_not_found(self): + if iotests.qemu_default_machine != 'pc': + return +diff --git a/tests/qemu-iotests/055.out b/tests/qemu-iotests/055.out +index 5c26d15..0a5e958 100644 +--- a/tests/qemu-iotests/055.out ++++ b/tests/qemu-iotests/055.out +@@ -1,5 +1,5 @@ +-.................................... ++........................................ 
+ ---------------------------------------------------------------------- +-Ran 36 tests ++Ran 40 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-Create-VM.blockdev_create.patch b/kvm-iotests-Create-VM.blockdev_create.patch new file mode 100755 index 0000000..805b31a --- /dev/null +++ b/kvm-iotests-Create-VM.blockdev_create.patch @@ -0,0 +1,59 @@ +From 05fedde1374abb180cd2b51457385d8128aa7fe4 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:00 +0000 +Subject: [PATCH 03/18] iotests: Create VM.blockdev_create() + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-3-kwolf@redhat.com> +Patchwork-id: 93748 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 2/6] iotests: Create VM.blockdev_create() +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +We have several almost identical copies of a blockdev_create() function +in different test cases. Time to create one unified function in +iotests.py. + +To keep the diff managable, this patch only creates the function and +follow-up patches will convert the individual test cases. + +Signed-off-by: Kevin Wolf +(cherry picked from commit e9dbd1cae86f7cb6f8e470e1485aeb0c6e23ae64) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/iotests.py | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 3cff671..5741efb 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -638,6 +638,22 @@ class VM(qtest.QEMUQtestMachine): + elif status == 'null': + return error + ++ # Returns None on success, and an error string on failure ++ def blockdev_create(self, options, job_id='job0', filters=None): ++ if filters is None: ++ filters = [filter_qmp_testfiles] ++ result = self.qmp_log('blockdev-create', filters=filters, ++ job_id=job_id, options=options) ++ ++ if 'return' in result: ++ assert result['return'] == {} ++ job_result = self.run_job(job_id) ++ else: ++ job_result = result['error'] ++ ++ log("") ++ return job_result ++ + def enable_migration_events(self, name): + log('Enabling migration QMP events on %s...' % name) + log(self.qmp('migrate-set-capabilities', capabilities=[ +-- +1.8.3.1 + diff --git a/kvm-iotests-Filter-testfiles-out-in-filter_img_info.patch b/kvm-iotests-Filter-testfiles-out-in-filter_img_info.patch new file mode 100755 index 0000000..60c08ec --- /dev/null +++ b/kvm-iotests-Filter-testfiles-out-in-filter_img_info.patch @@ -0,0 +1,52 @@ +From 8dc8a17d4e98aae41db01cbc073e69de44291b63 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:38 +0100 +Subject: [PATCH 10/17] iotests: Filter testfiles out in filter_img_info() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-10-kwolf@redhat.com> +Patchwork-id: 97455 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 09/11] iotests: Filter testfiles out in filter_img_info() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +We want to keep TEST_IMG for the full path of the main test image, but +filter_testfiles() must be called for other test images before replacing +other things like the image format because the test 
directory path could +contain the format as a substring. + +Insert a filter_testfiles() call between both. + +Signed-off-by: Kevin Wolf +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200424125448.63318-9-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit fd586ce8bee50d98773436214dc9e644ddda54aa) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/iotests.py | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 7a9c779..cd5df36 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -335,8 +335,9 @@ def filter_img_info(output, filename): + for line in output.split('\n'): + if 'disk size' in line or 'actual-size' in line: + continue +- line = line.replace(filename, 'TEST_IMG') \ +- .replace(imgfmt, 'IMGFMT') ++ line = line.replace(filename, 'TEST_IMG') ++ line = filter_testfiles(line) ++ line = line.replace(imgfmt, 'IMGFMT') + line = re.sub('iters: [0-9]+', 'iters: XXX', line) + line = re.sub('uuid: [-a-f0-9]+', 'uuid: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX', line) + line = re.sub('cid: [0-9]+', 'cid: XXXXXXXXXX', line) +-- +1.8.3.1 + diff --git a/kvm-iotests-Fix-run_job-with-use_log-False.patch b/kvm-iotests-Fix-run_job-with-use_log-False.patch new file mode 100755 index 0000000..b105fc2 --- /dev/null +++ b/kvm-iotests-Fix-run_job-with-use_log-False.patch @@ -0,0 +1,47 @@ +From bb7b968a02c97564596b73d8d080cd745d96ed6b Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:35 +0000 +Subject: [PATCH 15/20] iotests: Fix run_job() with use_log=False + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-10-kwolf@redhat.com> +Patchwork-id: 94284 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 09/13] iotests: Fix run_job() with use_log=False +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. 
Berrange +RH-Acked-by: Peter Krempa + +The 'job-complete' QMP command should be run with qmp() rather than +qmp_log() if use_log=False is passed. + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-4-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit b31b532122ec6f68d17168449c034d2197bf96ec) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/iotests.py | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 0c55f7b..46f880c 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -618,7 +618,10 @@ class VM(qtest.QEMUQtestMachine): + if use_log: + log('Job failed: %s' % (j['error'])) + elif status == 'ready': +- self.qmp_log('job-complete', id=job) ++ if use_log: ++ self.qmp_log('job-complete', id=job) ++ else: ++ self.qmp('job-complete', id=job) + elif status == 'pending' and not auto_finalize: + if pre_finalize: + pre_finalize() +-- +1.8.3.1 + diff --git a/kvm-iotests-Fix-test-178.patch b/kvm-iotests-Fix-test-178.patch new file mode 100755 index 0000000..5e54daa --- /dev/null +++ b/kvm-iotests-Fix-test-178.patch @@ -0,0 +1,59 @@ +From a04d324e41a40a6893bc94109994afc017f17192 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:16 +0100 +Subject: [PATCH 11/26] iotests: Fix test 178 + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-9-eblake@redhat.com> +Patchwork-id: 97075 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 08/12] iotests: Fix test 178 +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +A recent change to qemu-img changed expected error message output, but +178 takes long enough to execute that it does not get run by 'make +check' or './check -g quick'. 
+ +Fixes: 43d589b074 +Signed-off-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200521192137.1120211-2-eblake@redhat.com> +(cherry picked from commit ca01b7a641527052e3e8961845b40b81706ce5f9) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/178.out.qcow2 | 2 +- + tests/qemu-iotests/178.out.raw | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2 +index 9e7d8c4..345eab3 100644 +--- a/tests/qemu-iotests/178.out.qcow2 ++++ b/tests/qemu-iotests/178.out.qcow2 +@@ -13,7 +13,7 @@ qemu-img: Invalid option list: , + qemu-img: Invalid parameter 'snapshot.foo' + qemu-img: Failed in parsing snapshot param 'snapshot.foo' + qemu-img: --output must be used with human or json as argument. +-qemu-img: Image size must be less than 8 EiB! ++qemu-img: Invalid image size specified. Must be between 0 and 9223372036854775807. + qemu-img: Unknown file format 'foo' + + == Size calculation for a new file (human) == +diff --git a/tests/qemu-iotests/178.out.raw b/tests/qemu-iotests/178.out.raw +index 6478365..15da915 100644 +--- a/tests/qemu-iotests/178.out.raw ++++ b/tests/qemu-iotests/178.out.raw +@@ -13,7 +13,7 @@ qemu-img: Invalid option list: , + qemu-img: Invalid parameter 'snapshot.foo' + qemu-img: Failed in parsing snapshot param 'snapshot.foo' + qemu-img: --output must be used with human or json as argument. +-qemu-img: Image size must be less than 8 EiB! ++qemu-img: Invalid image size specified. Must be between 0 and 9223372036854775807. 
+ qemu-img: Unknown file format 'foo' + + == Size calculation for a new file (human) == +-- +1.8.3.1 + diff --git a/kvm-iotests-Let-_make_test_img-parse-its-parameters.patch b/kvm-iotests-Let-_make_test_img-parse-its-parameters.patch new file mode 100755 index 0000000..d24f5e7 --- /dev/null +++ b/kvm-iotests-Let-_make_test_img-parse-its-parameters.patch @@ -0,0 +1,91 @@ +From 3c96dbd74fb67e2ae1a116b2771290b192041707 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:10 +0100 +Subject: [PATCH 05/26] iotests: Let _make_test_img parse its parameters + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-3-eblake@redhat.com> +Patchwork-id: 97070 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 02/12] iotests: Let _make_test_img parse its parameters +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +From: Max Reitz + +This will allow us to add more options than just -b. + +Signed-off-by: Max Reitz +Reviewed-by: Maxim Levitsky +Message-id: 20191107163708.833192-9-mreitz@redhat.com +Signed-off-by: Max Reitz +(cherry picked from commit eea871d047701b563cfd66c1566b9ff6d163882b) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/common.rc | 28 ++++++++++++++++++++-------- + 1 file changed, 20 insertions(+), 8 deletions(-) + +diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc +index 0cc8acc..99fef4d 100644 +--- a/tests/qemu-iotests/common.rc ++++ b/tests/qemu-iotests/common.rc +@@ -302,12 +302,12 @@ _make_test_img() + # extra qemu-img options can be added by tests + # at least one argument (the image size) needs to be added + local extra_img_options="" +- local image_size=$* + local optstr="" + local img_name="" + local use_backing=0 + local backing_file="" + local object_options="" ++ local misc_params=() + + if [ -n "$TEST_IMG_FILE" ]; then + img_name=$TEST_IMG_FILE +@@ -323,11 +323,23 @@ _make_test_img() + optstr=$(_optstr_add "$optstr" "key-secret=keysec0") + fi + +- if [ "$1" = "-b" ]; then +- use_backing=1 +- backing_file=$2 +- image_size=$3 +- fi ++ for param; do ++ if [ "$use_backing" = "1" -a -z "$backing_file" ]; then ++ backing_file=$param ++ continue ++ fi ++ ++ case "$param" in ++ -b) ++ use_backing=1 ++ ;; ++ ++ *) ++ misc_params=("${misc_params[@]}" "$param") ++ ;; ++ esac ++ done ++ + if [ \( "$IMGFMT" = "qcow2" -o "$IMGFMT" = "qed" \) -a -n "$CLUSTER_SIZE" ]; then + optstr=$(_optstr_add "$optstr" "cluster_size=$CLUSTER_SIZE") + fi +@@ -343,9 +355,9 @@ _make_test_img() + # XXX(hch): have global image options? 
+ ( + if [ $use_backing = 1 ]; then +- $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options -b "$backing_file" "$img_name" $image_size 2>&1 ++ $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options -b "$backing_file" "$img_name" "${misc_params[@]}" 2>&1 + else +- $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options "$img_name" $image_size 2>&1 ++ $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options "$img_name" "${misc_params[@]}" 2>&1 + fi + ) | _filter_img_create + +-- +1.8.3.1 + diff --git a/kvm-iotests-Mirror-with-different-source-target-size.patch b/kvm-iotests-Mirror-with-different-source-target-size.patch new file mode 100755 index 0000000..7757632 --- /dev/null +++ b/kvm-iotests-Mirror-with-different-source-target-size.patch @@ -0,0 +1,110 @@ +From aff543186ff316d66b2c7acb434c6c17bdb8da78 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:25 +0100 +Subject: [PATCH 26/26] iotests: Mirror with different source/target size + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-12-kwolf@redhat.com> +Patchwork-id: 97109 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 11/11] iotests: Mirror with different source/target size +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +This tests that the mirror job catches situations where the target node +has a different size than the source node. It must also forbid resize +operations when the job is already running. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Message-Id: <20200511135825.219437-5-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit 16cea4ee1c8e5a69a058e76f426b2e17974d8d7d) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/041 | 45 +++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/041.out | 4 ++-- + 2 files changed, 47 insertions(+), 2 deletions(-) + +diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 +index a543b15..20fb68a 100755 +--- a/tests/qemu-iotests/041 ++++ b/tests/qemu-iotests/041 +@@ -240,6 +240,49 @@ class TestSingleBlockdev(TestSingleDrive): + target=self.qmp_target) + self.assert_qmp(result, 'error/class', 'GenericError') + ++ def do_test_resize(self, device, node): ++ def pre_finalize(): ++ if device: ++ result = self.vm.qmp('block_resize', device=device, size=65536) ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ result = self.vm.qmp('block_resize', node_name=node, size=65536) ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ result = self.vm.qmp(self.qmp_cmd, job_id='job0', device='drive0', ++ sync='full', target=self.qmp_target, ++ auto_finalize=False, auto_dismiss=False) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm.run_job('job0', auto_finalize=False, ++ pre_finalize=pre_finalize, use_log=False) ++ self.assertEqual(result, None) ++ ++ def test_source_resize(self): ++ self.do_test_resize('drive0', 'top') ++ ++ def test_target_resize(self): ++ self.do_test_resize(None, self.qmp_target) ++ ++ def do_test_target_size(self, size): ++ result = self.vm.qmp('block_resize', node_name=self.qmp_target, ++ size=size) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm.qmp(self.qmp_cmd, job_id='job0', ++ device='drive0', sync='full', auto_dismiss=False, ++ target=self.qmp_target) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm.run_job('job0', use_log=False) ++ self.assertEqual(result, 'Source and target image have different sizes') ++ ++ def test_small_target(self): ++ self.do_test_target_size(self.image_len // 2) ++ ++ def test_large_target(self): ++ self.do_test_target_size(self.image_len * 2) ++ + test_large_cluster = None + 
test_image_not_found = None + test_small_buffer2 = None +@@ -251,6 +294,8 @@ class TestSingleDriveZeroLength(TestSingleDrive): + + class TestSingleBlockdevZeroLength(TestSingleBlockdev): + image_len = 0 ++ test_small_target = None ++ test_large_target = None + + class TestSingleDriveUnalignedLength(TestSingleDrive): + image_len = 1025 * 1024 +diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out +index 2c448b4..3ea6aa4 100644 +--- a/tests/qemu-iotests/041.out ++++ b/tests/qemu-iotests/041.out +@@ -1,5 +1,5 @@ +-.......................................................................................... ++.................................................................................................... + ---------------------------------------------------------------------- +-Ran 90 tests ++Ran 100 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-Refactor-blockdev-reopen-test-for-iothreads.patch b/kvm-iotests-Refactor-blockdev-reopen-test-for-iothreads.patch new file mode 100755 index 0000000..17e4a41 --- /dev/null +++ b/kvm-iotests-Refactor-blockdev-reopen-test-for-iothreads.patch @@ -0,0 +1,122 @@ +From 7e23b64dc20b64ca6fa887cd06cc5e52374f6268 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:30 +0000 +Subject: [PATCH 10/20] iotests: Refactor blockdev-reopen test for iothreads + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-5-kwolf@redhat.com> +Patchwork-id: 94281 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 04/13] iotests: Refactor blockdev-reopen test for iothreads +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +We'll want to test more than one successful case in the future, so +prepare the test for that by a refactoring that runs each scenario in a +separate VM. 
+ +test_iothreads_switch_{backing,overlay} currently produce errors, but +these are cases that should actually work, by switching either the +backing file node or the overlay node to the AioContext of the other +node. + +Signed-off-by: Kevin Wolf +Tested-by: Peter Krempa +Message-Id: <20200306141413.30705-2-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 97518e11c3d902a32386d33797044f6b79bccc6f) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/245 | 47 ++++++++++++++++++++++++++++++++++++---------- + tests/qemu-iotests/245.out | 4 ++-- + 2 files changed, 39 insertions(+), 12 deletions(-) + +diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 +index e66a23c..f69c2fa 100644 +--- a/tests/qemu-iotests/245 ++++ b/tests/qemu-iotests/245 +@@ -968,8 +968,7 @@ class TestBlockdevReopen(iotests.QMPTestCase): + self.assertEqual(self.get_node('hd1'), None) + self.assert_qmp(self.get_node('hd2'), 'ro', True) + +- # We don't allow setting a backing file that uses a different AioContext +- def test_iothreads(self): ++ def run_test_iothreads(self, iothread_a, iothread_b, errmsg = None): + opts = hd_opts(0) + result = self.vm.qmp('blockdev-add', conv_keys = False, **opts) + self.assert_qmp(result, 'return', {}) +@@ -984,20 +983,48 @@ class TestBlockdevReopen(iotests.QMPTestCase): + result = self.vm.qmp('object-add', qom_type='iothread', id='iothread1') + self.assert_qmp(result, 'return', {}) + +- result = self.vm.qmp('x-blockdev-set-iothread', node_name='hd0', iothread='iothread0') ++ result = self.vm.qmp('device_add', driver='virtio-scsi', id='scsi0', ++ iothread=iothread_a) + self.assert_qmp(result, 'return', {}) + +- self.reopen(opts, {'backing': 'hd2'}, "Cannot use a new backing file with a different AioContext") +- +- result = self.vm.qmp('x-blockdev-set-iothread', node_name='hd2', iothread='iothread1') ++ result = self.vm.qmp('device_add', driver='virtio-scsi', id='scsi1', ++ iothread=iothread_b) + 
self.assert_qmp(result, 'return', {}) + +- self.reopen(opts, {'backing': 'hd2'}, "Cannot use a new backing file with a different AioContext") ++ if iothread_a: ++ result = self.vm.qmp('device_add', driver='scsi-hd', drive='hd0', ++ share_rw=True, bus="scsi0.0") ++ self.assert_qmp(result, 'return', {}) + +- result = self.vm.qmp('x-blockdev-set-iothread', node_name='hd2', iothread='iothread0') +- self.assert_qmp(result, 'return', {}) ++ if iothread_b: ++ result = self.vm.qmp('device_add', driver='scsi-hd', drive='hd2', ++ share_rw=True, bus="scsi1.0") ++ self.assert_qmp(result, 'return', {}) + +- self.reopen(opts, {'backing': 'hd2'}) ++ # Attaching the backing file may or may not work ++ self.reopen(opts, {'backing': 'hd2'}, errmsg) ++ ++ # But removing the backing file should always work ++ self.reopen(opts, {'backing': None}) ++ ++ self.vm.shutdown() ++ ++ # We don't allow setting a backing file that uses a different AioContext if ++ # neither of them can switch to the other AioContext ++ def test_iothreads_error(self): ++ self.run_test_iothreads('iothread0', 'iothread1', ++ "Cannot use a new backing file with a different AioContext") ++ ++ def test_iothreads_compatible_users(self): ++ self.run_test_iothreads('iothread0', 'iothread0') ++ ++ def test_iothreads_switch_backing(self): ++ self.run_test_iothreads('iothread0', None, ++ "Cannot use a new backing file with a different AioContext") ++ ++ def test_iothreads_switch_overlay(self): ++ self.run_test_iothreads(None, 'iothread0', ++ "Cannot use a new backing file with a different AioContext") + + if __name__ == '__main__': + iotests.main(supported_fmts=["qcow2"], +diff --git a/tests/qemu-iotests/245.out b/tests/qemu-iotests/245.out +index a19de52..682b933 100644 +--- a/tests/qemu-iotests/245.out ++++ b/tests/qemu-iotests/245.out +@@ -1,6 +1,6 @@ +-.................. ++..................... 
+ ---------------------------------------------------------------------- +-Ran 18 tests ++Ran 21 tests + + OK + {"execute": "job-finalize", "arguments": {"id": "commit0"}} +-- +1.8.3.1 + diff --git a/kvm-iotests-Support-job-complete-in-run_job.patch b/kvm-iotests-Support-job-complete-in-run_job.patch new file mode 100755 index 0000000..08971a0 --- /dev/null +++ b/kvm-iotests-Support-job-complete-in-run_job.patch @@ -0,0 +1,46 @@ +From a3778aef0be61dead835af39073a62bbf72c8e20 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:23:59 +0000 +Subject: [PATCH 02/18] iotests: Support job-complete in run_job() + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-2-kwolf@redhat.com> +Patchwork-id: 93746 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 1/6] iotests: Support job-complete in run_job() +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Automatically complete jobs that have a 'ready' state and need an +explicit job-complete. Without this, run_job() would hang for such +jobs. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 4688c4e32ec76004676470f11734478799673d6d) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/iotests.py | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index df07089..3cff671 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -617,6 +617,8 @@ class VM(qtest.QEMUQtestMachine): + error = j['error'] + if use_log: + log('Job failed: %s' % (j['error'])) ++ elif status == 'ready': ++ self.qmp_log('job-complete', id=job) + elif status == 'pending' and not auto_finalize: + if pre_finalize: + pre_finalize() +-- +1.8.3.1 + diff --git a/kvm-iotests-Test-committing-to-short-backing-file.patch b/kvm-iotests-Test-committing-to-short-backing-file.patch new file mode 100755 index 0000000..fbbaac6 --- /dev/null +++ b/kvm-iotests-Test-committing-to-short-backing-file.patch @@ -0,0 +1,480 @@ +From e2a1b3fd32be8bb730656a6f22eb4f543b120c9d Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:39 +0100 +Subject: [PATCH 11/17] iotests: Test committing to short backing file + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-11-kwolf@redhat.com> +Patchwork-id: 97453 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 10/11] iotests: Test committing to short backing file +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +Signed-off-by: Kevin Wolf +Message-Id: <20200424125448.63318-10-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit bf03dede475e29a16f9188ea85a4d77cd3dcf2b7) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/274 | 155 ++++++++++++++++++++++++++ + tests/qemu-iotests/274.out | 268 +++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/group | 1 + + 3 files changed, 424 insertions(+) + create mode 100755 tests/qemu-iotests/274 + create mode 100644 tests/qemu-iotests/274.out + +diff --git a/tests/qemu-iotests/274 b/tests/qemu-iotests/274 +new file mode 100755 +index 0000000..e951f72 +--- /dev/null ++++ b/tests/qemu-iotests/274 +@@ -0,0 +1,155 @@ ++#!/usr/bin/env python3 ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . 
++# ++# Creator/Owner: Kevin Wolf ++# ++# Some tests for short backing files and short overlays ++ ++import iotests ++ ++iotests.verify_image_format(supported_fmts=['qcow2']) ++iotests.verify_platform(['linux']) ++ ++size_short = 1 * 1024 * 1024 ++size_long = 2 * 1024 * 1024 ++size_diff = size_long - size_short ++ ++def create_chain() -> None: ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, base, ++ str(size_long)) ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', base, mid, ++ str(size_short)) ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', mid, top, ++ str(size_long)) ++ ++ iotests.qemu_io_log('-c', 'write -P 1 0 %d' % size_long, base) ++ ++def create_vm() -> iotests.VM: ++ vm = iotests.VM() ++ vm.add_blockdev('file,filename=%s,node-name=base-file' % base) ++ vm.add_blockdev('%s,file=base-file,node-name=base' % iotests.imgfmt) ++ vm.add_blockdev('file,filename=%s,node-name=mid-file' % mid) ++ vm.add_blockdev('%s,file=mid-file,node-name=mid,backing=base' ++ % iotests.imgfmt) ++ vm.add_drive(top, 'backing=mid,node-name=top') ++ return vm ++ ++with iotests.FilePath('base') as base, \ ++ iotests.FilePath('mid') as mid, \ ++ iotests.FilePath('top') as top: ++ ++ iotests.log('== Commit tests ==') ++ ++ create_chain() ++ ++ iotests.log('=== Check visible data ===') ++ ++ iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, top) ++ iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), top) ++ ++ iotests.log('=== Checking allocation status ===') ++ ++ iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short, ++ '-c', 'alloc %d %d' % (size_short, size_diff), ++ base) ++ ++ iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short, ++ '-c', 'alloc %d %d' % (size_short, size_diff), ++ mid) ++ ++ iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short, ++ '-c', 'alloc %d %d' % (size_short, size_diff), ++ top) ++ ++ iotests.log('=== Checking map ===') ++ ++ iotests.qemu_img_log('map', '--output=json', base) ++ iotests.qemu_img_log('map', 
'--output=human', base) ++ iotests.qemu_img_log('map', '--output=json', mid) ++ iotests.qemu_img_log('map', '--output=human', mid) ++ iotests.qemu_img_log('map', '--output=json', top) ++ iotests.qemu_img_log('map', '--output=human', top) ++ ++ iotests.log('=== Testing qemu-img commit (top -> mid) ===') ++ ++ iotests.qemu_img_log('commit', top) ++ iotests.img_info_log(mid) ++ iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid) ++ iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid) ++ ++ iotests.log('=== Testing HMP commit (top -> mid) ===') ++ ++ create_chain() ++ with create_vm() as vm: ++ vm.launch() ++ vm.qmp_log('human-monitor-command', command_line='commit drive0') ++ ++ iotests.img_info_log(mid) ++ iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid) ++ iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid) ++ ++ iotests.log('=== Testing QMP active commit (top -> mid) ===') ++ ++ create_chain() ++ with create_vm() as vm: ++ vm.launch() ++ vm.qmp_log('block-commit', device='top', base_node='mid', ++ job_id='job0', auto_dismiss=False) ++ vm.run_job('job0', wait=5) ++ ++ iotests.img_info_log(mid) ++ iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid) ++ iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid) ++ ++ ++ iotests.log('== Resize tests ==') ++ ++ # Use different sizes for different allocation modes: ++ # ++ # We want to have at least one test where 32 bit truncation in the size of ++ # the overlapping area becomes visible. This is covered by the ++ # prealloc='off' case (1G to 6G is an overlap of 5G). ++ # ++ # However, we can only do this for modes that don't preallocate data ++ # because otherwise we might run out of space on the test host. ++ # ++ # We also want to test some unaligned combinations. 
++ for (prealloc, base_size, top_size_old, top_size_new, off) in [ ++ ('off', '6G', '1G', '8G', '5G'), ++ ('metadata', '32G', '30G', '33G', '31G'), ++ ('falloc', '10M', '5M', '15M', '9M'), ++ ('full', '16M', '8M', '12M', '11M'), ++ ('off', '384k', '253k', '512k', '253k'), ++ ('off', '400k', '256k', '512k', '336k'), ++ ('off', '512k', '256k', '500k', '436k')]: ++ ++ iotests.log('=== preallocation=%s ===' % prealloc) ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, base, base_size) ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', base, top, ++ top_size_old) ++ iotests.qemu_io_log('-c', 'write -P 1 %s 64k' % off, base) ++ ++ # After this, top_size_old to base_size should be allocated/zeroed. ++ # ++ # In theory, leaving base_size to top_size_new unallocated would be ++ # correct, but in practice, if we zero out anything, we zero out ++ # everything up to top_size_new. ++ iotests.qemu_img_log('resize', '-f', iotests.imgfmt, ++ '--preallocation', prealloc, top, top_size_new) ++ iotests.qemu_io_log('-c', 'read -P 0 %s 64k' % off, top) ++ iotests.qemu_io_log('-c', 'map', top) ++ iotests.qemu_img_log('map', '--output=json', top) +diff --git a/tests/qemu-iotests/274.out b/tests/qemu-iotests/274.out +new file mode 100644 +index 0000000..1a796fd +--- /dev/null ++++ b/tests/qemu-iotests/274.out +@@ -0,0 +1,268 @@ ++== Commit tests == ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 2097152/2097152 bytes at offset 0 ++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Check visible data === ++read 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ 
++read 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Checking allocation status === ++1048576/1048576 bytes allocated at offset 0 bytes ++1048576/1048576 bytes allocated at offset 1 MiB ++ ++0/1048576 bytes allocated at offset 0 bytes ++0/0 bytes allocated at offset 1 MiB ++ ++0/1048576 bytes allocated at offset 0 bytes ++0/1048576 bytes allocated at offset 1 MiB ++ ++=== Checking map === ++[{ "start": 0, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": 327680}] ++ ++Offset Length Mapped to File ++0 0x200000 0x50000 TEST_DIR/PID-base ++ ++[{ "start": 0, "length": 1048576, "depth": 1, "zero": false, "data": true, "offset": 327680}] ++ ++Offset Length Mapped to File ++0 0x100000 0x50000 TEST_DIR/PID-base ++ ++[{ "start": 0, "length": 1048576, "depth": 2, "zero": false, "data": true, "offset": 327680}, ++{ "start": 1048576, "length": 1048576, "depth": 0, "zero": true, "data": false}] ++ ++Offset Length Mapped to File ++0 0x100000 0x50000 TEST_DIR/PID-base ++ ++=== Testing qemu-img commit (top -> mid) === ++Image committed. 
++ ++image: TEST_IMG ++file format: IMGFMT ++virtual size: 2 MiB (2097152 bytes) ++cluster_size: 65536 ++backing file: TEST_DIR/PID-base ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ refcount bits: 16 ++ corrupt: false ++ ++read 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++read 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Testing HMP commit (top -> mid) === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 2097152/2097152 bytes at offset 0 ++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++{"execute": "human-monitor-command", "arguments": {"command-line": "commit drive0"}} ++{"return": ""} ++image: TEST_IMG ++file format: IMGFMT ++virtual size: 2 MiB (2097152 bytes) ++cluster_size: 65536 ++backing file: TEST_DIR/PID-base ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ refcount bits: 16 ++ corrupt: false ++ ++read 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++read 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Testing QMP active commit (top -> mid) === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off 
refcount_bits=16 ++ ++wrote 2097152/2097152 bytes at offset 0 ++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++{"execute": "block-commit", "arguments": {"auto-dismiss": false, "base-node": "mid", "device": "top", "job-id": "job0"}} ++{"return": {}} ++{"execute": "job-complete", "arguments": {"id": "job0"}} ++{"return": {}} ++{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"execute": "job-dismiss", "arguments": {"id": "job0"}} ++{"return": {}} ++image: TEST_IMG ++file format: IMGFMT ++virtual size: 2 MiB (2097152 bytes) ++cluster_size: 65536 ++backing file: TEST_DIR/PID-base ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ refcount bits: 16 ++ corrupt: false ++ ++read 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++read 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== Resize tests == ++=== preallocation=off === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=6442450944 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=1073741824 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 5368709120 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. 
++ ++read 65536/65536 bytes at offset 5368709120 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++1 GiB (0x40000000) bytes not allocated at offset 0 bytes (0x0) ++7 GiB (0x1c0000000) bytes allocated at offset 1 GiB (0x40000000) ++ ++[{ "start": 0, "length": 1073741824, "depth": 1, "zero": true, "data": false}, ++{ "start": 1073741824, "length": 7516192768, "depth": 0, "zero": true, "data": false}] ++ ++=== preallocation=metadata === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=34359738368 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=32212254720 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 33285996544 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 33285996544 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++30 GiB (0x780000000) bytes not allocated at offset 0 bytes (0x0) ++3 GiB (0xc0000000) bytes allocated at offset 30 GiB (0x780000000) ++ ++[{ "start": 0, "length": 32212254720, "depth": 1, "zero": true, "data": false}, ++{ "start": 32212254720, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 327680}, ++{ "start": 32749125632, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 537264128}, ++{ "start": 33285996544, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 1074200576}, ++{ "start": 33822867456, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 1611137024}, ++{ "start": 34359738368, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 2148139008}, ++{ "start": 34896609280, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 2685075456}] ++ ++=== preallocation=falloc === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=10485760 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 
'TEST_DIR/PID-top', fmt=qcow2 size=5242880 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 9437184 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 9437184 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++5 MiB (0x500000) bytes not allocated at offset 0 bytes (0x0) ++10 MiB (0xa00000) bytes allocated at offset 5 MiB (0x500000) ++ ++[{ "start": 0, "length": 5242880, "depth": 1, "zero": true, "data": false}, ++{ "start": 5242880, "length": 10485760, "depth": 0, "zero": true, "data": false, "offset": 327680}] ++ ++=== preallocation=full === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=16777216 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=8388608 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 11534336 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 11534336 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++8 MiB (0x800000) bytes not allocated at offset 0 bytes (0x0) ++4 MiB (0x400000) bytes allocated at offset 8 MiB (0x800000) ++ ++[{ "start": 0, "length": 8388608, "depth": 1, "zero": true, "data": false}, ++{ "start": 8388608, "length": 4194304, "depth": 0, "zero": true, "data": false, "offset": 327680}] ++ ++=== preallocation=off === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=393216 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=259072 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 259072 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. 
++ ++read 65536/65536 bytes at offset 259072 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++192 KiB (0x30000) bytes not allocated at offset 0 bytes (0x0) ++320 KiB (0x50000) bytes allocated at offset 192 KiB (0x30000) ++ ++[{ "start": 0, "length": 196608, "depth": 1, "zero": true, "data": false}, ++{ "start": 196608, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, ++{ "start": 262144, "length": 262144, "depth": 0, "zero": true, "data": false}] ++ ++=== preallocation=off === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=409600 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=262144 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 344064 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 344064 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++256 KiB (0x40000) bytes not allocated at offset 0 bytes (0x0) ++256 KiB (0x40000) bytes allocated at offset 256 KiB (0x40000) ++ ++[{ "start": 0, "length": 262144, "depth": 1, "zero": true, "data": false}, ++{ "start": 262144, "length": 262144, "depth": 0, "zero": true, "data": false}] ++ ++=== preallocation=off === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=524288 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=262144 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 446464 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. 
++ ++read 65536/65536 bytes at offset 446464 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++256 KiB (0x40000) bytes not allocated at offset 0 bytes (0x0) ++244 KiB (0x3d000) bytes allocated at offset 256 KiB (0x40000) ++ ++[{ "start": 0, "length": 262144, "depth": 1, "zero": true, "data": false}, ++{ "start": 262144, "length": 249856, "depth": 0, "zero": true, "data": false}] ++ +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 033b54d..cddae00 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -286,6 +286,7 @@ + 270 rw backing quick + 272 rw + 273 backing quick ++274 rw backing + 277 rw quick + 280 rw migration quick + 281 rw quick +-- +1.8.3.1 + diff --git a/kvm-iotests-Test-external-snapshot-with-VM-state.patch b/kvm-iotests-Test-external-snapshot-with-VM-state.patch new file mode 100755 index 0000000..6fcb2f6 --- /dev/null +++ b/kvm-iotests-Test-external-snapshot-with-VM-state.patch @@ -0,0 +1,189 @@ +From 38b0cff9703fc740c30f5874973ac1be88f94d9f Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:03 +0000 +Subject: [PATCH 06/18] iotests: Test external snapshot with VM state + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-6-kwolf@redhat.com> +Patchwork-id: 93752 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 5/6] iotests: Test external snapshot with VM state +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +This tests creating an external snapshot with VM state (which results in +an active overlay over an inactive backing file, which is also the root +node of an inactive BlockBackend), re-activating the images and +performing some operations to test that the re-activation worked as +intended. + +Signed-off-by: Kevin Wolf +(cherry picked from commit f62f08ab7a9d902da70078992248ec5c98f652ad) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/280 | 83 ++++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/280.out | 50 ++++++++++++++++++++++++++++ + tests/qemu-iotests/group | 1 + + 3 files changed, 134 insertions(+) + create mode 100755 tests/qemu-iotests/280 + create mode 100644 tests/qemu-iotests/280.out + +diff --git a/tests/qemu-iotests/280 b/tests/qemu-iotests/280 +new file mode 100755 +index 0000000..0b1fa8e +--- /dev/null ++++ b/tests/qemu-iotests/280 +@@ -0,0 +1,83 @@ ++#!/usr/bin/env python ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++# Creator/Owner: Kevin Wolf ++# ++# Test migration to file for taking an external snapshot with VM state. 
++ ++import iotests ++import os ++ ++iotests.verify_image_format(supported_fmts=['qcow2']) ++iotests.verify_protocol(supported=['file']) ++iotests.verify_platform(['linux']) ++ ++with iotests.FilePath('base') as base_path , \ ++ iotests.FilePath('top') as top_path, \ ++ iotests.VM() as vm: ++ ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, base_path, '64M') ++ ++ iotests.log('=== Launch VM ===') ++ vm.add_object('iothread,id=iothread0') ++ vm.add_blockdev('file,filename=%s,node-name=base-file' % (base_path)) ++ vm.add_blockdev('%s,file=base-file,node-name=base-fmt' % (iotests.imgfmt)) ++ vm.add_device('virtio-blk,drive=base-fmt,iothread=iothread0,id=vda') ++ vm.launch() ++ ++ vm.enable_migration_events('VM') ++ ++ iotests.log('\n=== Migrate to file ===') ++ vm.qmp_log('migrate', uri='exec:cat > /dev/null') ++ ++ with iotests.Timeout(3, 'Migration does not complete'): ++ vm.wait_migration() ++ ++ iotests.log('\nVM is now stopped:') ++ iotests.log(vm.qmp('query-migrate')['return']['status']) ++ vm.qmp_log('query-status') ++ ++ iotests.log('\n=== Create a snapshot of the disk image ===') ++ vm.blockdev_create({ ++ 'driver': 'file', ++ 'filename': top_path, ++ 'size': 0, ++ }) ++ vm.qmp_log('blockdev-add', node_name='top-file', ++ driver='file', filename=top_path, ++ filters=[iotests.filter_qmp_testfiles]) ++ ++ vm.blockdev_create({ ++ 'driver': iotests.imgfmt, ++ 'file': 'top-file', ++ 'size': 1024 * 1024, ++ }) ++ vm.qmp_log('blockdev-add', node_name='top-fmt', ++ driver=iotests.imgfmt, file='top-file') ++ ++ vm.qmp_log('blockdev-snapshot', node='base-fmt', overlay='top-fmt') ++ ++ iotests.log('\n=== Resume the VM and simulate a write request ===') ++ vm.qmp_log('cont') ++ iotests.log(vm.hmp_qemu_io('-d vda/virtio-backend', 'write 4k 4k')) ++ ++ iotests.log('\n=== Commit it to the backing file ===') ++ result = vm.qmp_log('block-commit', job_id='job0', auto_dismiss=False, ++ device='top-fmt', top_node='top-fmt', ++ filters=[iotests.filter_qmp_testfiles]) ++ 
if 'return' in result: ++ vm.run_job('job0') +diff --git a/tests/qemu-iotests/280.out b/tests/qemu-iotests/280.out +new file mode 100644 +index 0000000..5d382fa +--- /dev/null ++++ b/tests/qemu-iotests/280.out +@@ -0,0 +1,50 @@ ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=67108864 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++=== Launch VM === ++Enabling migration QMP events on VM... ++{"return": {}} ++ ++=== Migrate to file === ++{"execute": "migrate", "arguments": {"uri": "exec:cat > /dev/null"}} ++{"return": {}} ++{"data": {"status": "setup"}, "event": "MIGRATION", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"status": "active"}, "event": "MIGRATION", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"status": "completed"}, "event": "MIGRATION", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++ ++VM is now stopped: ++completed ++{"execute": "query-status", "arguments": {}} ++{"return": {"running": false, "singlestep": false, "status": "postmigrate"}} ++ ++=== Create a snapshot of the disk image === ++{"execute": "blockdev-create", "arguments": {"job-id": "job0", "options": {"driver": "file", "filename": "TEST_DIR/PID-top", "size": 0}}} ++{"return": {}} ++{"execute": "job-dismiss", "arguments": {"id": "job0"}} ++{"return": {}} ++ ++{"execute": "blockdev-add", "arguments": {"driver": "file", "filename": "TEST_DIR/PID-top", "node-name": "top-file"}} ++{"return": {}} ++{"execute": "blockdev-create", "arguments": {"job-id": "job0", "options": {"driver": "qcow2", "file": "top-file", "size": 1048576}}} ++{"return": {}} ++{"execute": "job-dismiss", "arguments": {"id": "job0"}} ++{"return": {}} ++ ++{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": "top-file", "node-name": "top-fmt"}} ++{"return": {}} ++{"execute": "blockdev-snapshot", "arguments": {"node": "base-fmt", "overlay": "top-fmt"}} ++{"return": {}} ++ ++=== Resume the VM and simulate a write request === ++{"execute": 
"cont", "arguments": {}} ++{"return": {}} ++{"return": ""} ++ ++=== Commit it to the backing file === ++{"execute": "block-commit", "arguments": {"auto-dismiss": false, "device": "top-fmt", "job-id": "job0", "top-node": "top-fmt"}} ++{"return": {}} ++{"execute": "job-complete", "arguments": {"id": "job0"}} ++{"return": {}} ++{"data": {"device": "job0", "len": 65536, "offset": 65536, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"device": "job0", "len": 65536, "offset": 65536, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"execute": "job-dismiss", "arguments": {"id": "job0"}} ++{"return": {}} +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 06cc734..01301cd 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -286,3 +286,4 @@ + 272 rw + 273 backing quick + 277 rw quick ++280 rw migration quick +-- +1.8.3.1 + diff --git a/kvm-iotests-Test-handling-of-AioContexts-with-some-block.patch b/kvm-iotests-Test-handling-of-AioContexts-with-some-block.patch new file mode 100755 index 0000000..b09439b --- /dev/null +++ b/kvm-iotests-Test-handling-of-AioContexts-with-some-block.patch @@ -0,0 +1,322 @@ +From 6b9a6ba9ed753ad7aa714b35de938ebeeb4fa6cb Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 10:27:49 +0000 +Subject: [PATCH 16/18] iotests: Test handling of AioContexts with some + blockdev actions + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-10-slp@redhat.com> +Patchwork-id: 93762 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 9/9] iotests: Test handling of AioContexts with some blockdev actions +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Includes the following tests: + + - Adding a 
dirty bitmap. + * RHBZ: 1782175 + + - Starting a drive-mirror to an NBD-backed target. + * RHBZ: 1746217, 1773517 + + - Aborting an external snapshot transaction. + * RHBZ: 1779036 + + - Aborting a blockdev backup transaction. + * RHBZ: 1782111 + +For each one of them, a VM with a number of disks running in an +IOThread AioContext is used. + +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 9b8c59e7610b9c5315ef093d801843dbe8debfac) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/281 | 247 +++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/281.out | 5 + + tests/qemu-iotests/group | 1 + + 3 files changed, 253 insertions(+) + create mode 100755 tests/qemu-iotests/281 + create mode 100644 tests/qemu-iotests/281.out + +diff --git a/tests/qemu-iotests/281 b/tests/qemu-iotests/281 +new file mode 100755 +index 0000000..269d583 +--- /dev/null ++++ b/tests/qemu-iotests/281 +@@ -0,0 +1,247 @@ ++#!/usr/bin/env python ++# ++# Test cases for blockdev + IOThread interactions ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . 
++# ++ ++import os ++import iotests ++from iotests import qemu_img ++ ++image_len = 64 * 1024 * 1024 ++ ++# Test for RHBZ#1782175 ++class TestDirtyBitmapIOThread(iotests.QMPTestCase): ++ drive0_img = os.path.join(iotests.test_dir, 'drive0.img') ++ images = { 'drive0': drive0_img } ++ ++ def setUp(self): ++ for name in self.images: ++ qemu_img('create', '-f', iotests.imgfmt, ++ self.images[name], str(image_len)) ++ ++ self.vm = iotests.VM() ++ self.vm.add_object('iothread,id=iothread0') ++ ++ for name in self.images: ++ self.vm.add_blockdev('driver=file,filename=%s,node-name=file_%s' ++ % (self.images[name], name)) ++ self.vm.add_blockdev('driver=qcow2,file=file_%s,node-name=%s' ++ % (name, name)) ++ ++ self.vm.launch() ++ self.vm.qmp('x-blockdev-set-iothread', ++ node_name='drive0', iothread='iothread0', ++ force=True) ++ ++ def tearDown(self): ++ self.vm.shutdown() ++ for name in self.images: ++ os.remove(self.images[name]) ++ ++ def test_add_dirty_bitmap(self): ++ result = self.vm.qmp( ++ 'block-dirty-bitmap-add', ++ node='drive0', ++ name='bitmap1', ++ persistent=True, ++ ) ++ ++ self.assert_qmp(result, 'return', {}) ++ ++ ++# Test for RHBZ#1746217 & RHBZ#1773517 ++class TestNBDMirrorIOThread(iotests.QMPTestCase): ++ nbd_sock = os.path.join(iotests.sock_dir, 'nbd.sock') ++ drive0_img = os.path.join(iotests.test_dir, 'drive0.img') ++ mirror_img = os.path.join(iotests.test_dir, 'mirror.img') ++ images = { 'drive0': drive0_img, 'mirror': mirror_img } ++ ++ def setUp(self): ++ for name in self.images: ++ qemu_img('create', '-f', iotests.imgfmt, ++ self.images[name], str(image_len)) ++ ++ self.vm_src = iotests.VM(path_suffix='src') ++ self.vm_src.add_object('iothread,id=iothread0') ++ self.vm_src.add_blockdev('driver=file,filename=%s,node-name=file0' ++ % (self.drive0_img)) ++ self.vm_src.add_blockdev('driver=qcow2,file=file0,node-name=drive0') ++ self.vm_src.launch() ++ self.vm_src.qmp('x-blockdev-set-iothread', ++ node_name='drive0', iothread='iothread0', ++ 
force=True) ++ ++ self.vm_tgt = iotests.VM(path_suffix='tgt') ++ self.vm_tgt.add_object('iothread,id=iothread0') ++ self.vm_tgt.add_blockdev('driver=file,filename=%s,node-name=file0' ++ % (self.mirror_img)) ++ self.vm_tgt.add_blockdev('driver=qcow2,file=file0,node-name=drive0') ++ self.vm_tgt.launch() ++ self.vm_tgt.qmp('x-blockdev-set-iothread', ++ node_name='drive0', iothread='iothread0', ++ force=True) ++ ++ def tearDown(self): ++ self.vm_src.shutdown() ++ self.vm_tgt.shutdown() ++ for name in self.images: ++ os.remove(self.images[name]) ++ ++ def test_nbd_mirror(self): ++ result = self.vm_tgt.qmp( ++ 'nbd-server-start', ++ addr={ ++ 'type': 'unix', ++ 'data': { 'path': self.nbd_sock } ++ } ++ ) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm_tgt.qmp( ++ 'nbd-server-add', ++ device='drive0', ++ writable=True ++ ) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm_src.qmp( ++ 'drive-mirror', ++ device='drive0', ++ target='nbd+unix:///drive0?socket=' + self.nbd_sock, ++ sync='full', ++ mode='existing', ++ speed=64*1024*1024, ++ job_id='j1' ++ ) ++ self.assert_qmp(result, 'return', {}) ++ ++ self.vm_src.event_wait(name="BLOCK_JOB_READY") ++ ++ ++# Test for RHBZ#1779036 ++class TestExternalSnapshotAbort(iotests.QMPTestCase): ++ drive0_img = os.path.join(iotests.test_dir, 'drive0.img') ++ snapshot_img = os.path.join(iotests.test_dir, 'snapshot.img') ++ images = { 'drive0': drive0_img, 'snapshot': snapshot_img } ++ ++ def setUp(self): ++ for name in self.images: ++ qemu_img('create', '-f', iotests.imgfmt, ++ self.images[name], str(image_len)) ++ ++ self.vm = iotests.VM() ++ self.vm.add_object('iothread,id=iothread0') ++ self.vm.add_blockdev('driver=file,filename=%s,node-name=file0' ++ % (self.drive0_img)) ++ self.vm.add_blockdev('driver=qcow2,file=file0,node-name=drive0') ++ self.vm.launch() ++ self.vm.qmp('x-blockdev-set-iothread', ++ node_name='drive0', iothread='iothread0', ++ force=True) ++ ++ def tearDown(self): ++ self.vm.shutdown() 
++ for name in self.images: ++ os.remove(self.images[name]) ++ ++ def test_external_snapshot_abort(self): ++ # Use a two actions transaction with a bogus values on the second ++ # one to trigger an abort of the transaction. ++ result = self.vm.qmp('transaction', actions=[ ++ { ++ 'type': 'blockdev-snapshot-sync', ++ 'data': { 'node-name': 'drive0', ++ 'snapshot-file': self.snapshot_img, ++ 'snapshot-node-name': 'snap1', ++ 'mode': 'absolute-paths', ++ 'format': 'qcow2' } ++ }, ++ { ++ 'type': 'blockdev-snapshot-sync', ++ 'data': { 'node-name': 'drive0', ++ 'snapshot-file': '/fakesnapshot', ++ 'snapshot-node-name': 'snap2', ++ 'mode': 'absolute-paths', ++ 'format': 'qcow2' } ++ }, ++ ]) ++ ++ # Crashes on failure, we expect this error. ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ ++# Test for RHBZ#1782111 ++class TestBlockdevBackupAbort(iotests.QMPTestCase): ++ drive0_img = os.path.join(iotests.test_dir, 'drive0.img') ++ drive1_img = os.path.join(iotests.test_dir, 'drive1.img') ++ snap0_img = os.path.join(iotests.test_dir, 'snap0.img') ++ snap1_img = os.path.join(iotests.test_dir, 'snap1.img') ++ images = { 'drive0': drive0_img, ++ 'drive1': drive1_img, ++ 'snap0': snap0_img, ++ 'snap1': snap1_img } ++ ++ def setUp(self): ++ for name in self.images: ++ qemu_img('create', '-f', iotests.imgfmt, ++ self.images[name], str(image_len)) ++ ++ self.vm = iotests.VM() ++ self.vm.add_object('iothread,id=iothread0') ++ self.vm.add_device('virtio-scsi,iothread=iothread0') ++ ++ for name in self.images: ++ self.vm.add_blockdev('driver=file,filename=%s,node-name=file_%s' ++ % (self.images[name], name)) ++ self.vm.add_blockdev('driver=qcow2,file=file_%s,node-name=%s' ++ % (name, name)) ++ ++ self.vm.add_device('scsi-hd,drive=drive0') ++ self.vm.add_device('scsi-hd,drive=drive1') ++ self.vm.launch() ++ ++ def tearDown(self): ++ self.vm.shutdown() ++ for name in self.images: ++ os.remove(self.images[name]) ++ ++ def test_blockdev_backup_abort(self): ++ # Use a two 
actions transaction with a bogus values on the second ++ # one to trigger an abort of the transaction. ++ result = self.vm.qmp('transaction', actions=[ ++ { ++ 'type': 'blockdev-backup', ++ 'data': { 'device': 'drive0', ++ 'target': 'snap0', ++ 'sync': 'full', ++ 'job-id': 'j1' } ++ }, ++ { ++ 'type': 'blockdev-backup', ++ 'data': { 'device': 'drive1', ++ 'target': 'snap1', ++ 'sync': 'full' } ++ }, ++ ]) ++ ++ # Hangs on failure, we expect this error. ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++if __name__ == '__main__': ++ iotests.main(supported_fmts=['qcow2'], ++ supported_protocols=['file']) +diff --git a/tests/qemu-iotests/281.out b/tests/qemu-iotests/281.out +new file mode 100644 +index 0000000..89968f3 +--- /dev/null ++++ b/tests/qemu-iotests/281.out +@@ -0,0 +1,5 @@ ++.... ++---------------------------------------------------------------------- ++Ran 4 tests ++ ++OK +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 01301cd..c0e8197 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -287,3 +287,4 @@ + 273 backing quick + 277 rw quick + 280 rw migration quick ++281 rw quick +-- +1.8.3.1 + diff --git a/kvm-iotests-Test-mirror-with-temporarily-disabled-target.patch b/kvm-iotests-Test-mirror-with-temporarily-disabled-target.patch new file mode 100755 index 0000000..58ef198 --- /dev/null +++ b/kvm-iotests-Test-mirror-with-temporarily-disabled-target.patch @@ -0,0 +1,162 @@ +From 239f7bdeef48a3c0b07098617371b9955dc55348 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:36 +0000 +Subject: [PATCH 16/20] iotests: Test mirror with temporarily disabled target + backing file + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-11-kwolf@redhat.com> +Patchwork-id: 94288 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 10/13] iotests: Test mirror with temporarily disabled target backing file +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. 
Berrange +RH-Acked-by: Peter Krempa + +The newly tested scenario is a common live storage migration scenario: +The target node is opened without a backing file so that the active +layer is mirrored while its backing chain can be copied in the +background. + +The backing chain should be attached to the mirror target node when +finalising the job, just before switching the users of the source node +to the new copy (at which point the mirror job still has a reference to +the node). drive-mirror did this automatically, but with blockdev-mirror +this is the job of the QMP client. + +This patch adds test cases for two ways to achieve the desired result, +using either x-blockdev-reopen or blockdev-snapshot. + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-5-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit 8bdee9f10eac2aefdcc5095feef756354c87bdec) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/155 | 56 +++++++++++++++++++++++++++++++++++++++++----- + tests/qemu-iotests/155.out | 4 ++-- + 2 files changed, 53 insertions(+), 7 deletions(-) + +diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155 +index d7ef257..3053e50 100755 +--- a/tests/qemu-iotests/155 ++++ b/tests/qemu-iotests/155 +@@ -45,10 +45,15 @@ target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt) + # image during runtime, only makes sense if + # target_blockdev_backing is not None + # (None: same as target_backing) ++# target_open_with_backing: If True, the target image is added with its backing ++# chain opened right away. If False, blockdev-add ++# opens it without a backing file and job completion ++# is supposed to open the backing chain. 
+ + class BaseClass(iotests.QMPTestCase): + target_blockdev_backing = None + target_real_backing = None ++ target_open_with_backing = True + + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, back0_img, '1440K') +@@ -80,9 +85,13 @@ class BaseClass(iotests.QMPTestCase): + options = { 'node-name': 'target', + 'driver': iotests.imgfmt, + 'file': { 'driver': 'file', ++ 'node-name': 'target-file', + 'filename': target_img } } +- if self.target_blockdev_backing: +- options['backing'] = self.target_blockdev_backing ++ ++ if not self.target_open_with_backing: ++ options['backing'] = None ++ elif self.target_blockdev_backing: ++ options['backing'] = self.target_blockdev_backing + + result = self.vm.qmp('blockdev-add', **options) + self.assert_qmp(result, 'return', {}) +@@ -147,10 +156,14 @@ class BaseClass(iotests.QMPTestCase): + # cmd: Mirroring command to execute, either drive-mirror or blockdev-mirror + + class MirrorBaseClass(BaseClass): ++ def openBacking(self): ++ pass ++ + def runMirror(self, sync): + if self.cmd == 'blockdev-mirror': + result = self.vm.qmp(self.cmd, job_id='mirror-job', device='source', +- sync=sync, target='target') ++ sync=sync, target='target', ++ auto_finalize=False) + else: + if self.existing: + mode = 'existing' +@@ -159,11 +172,12 @@ class MirrorBaseClass(BaseClass): + result = self.vm.qmp(self.cmd, job_id='mirror-job', device='source', + sync=sync, target=target_img, + format=iotests.imgfmt, mode=mode, +- node_name='target') ++ node_name='target', auto_finalize=False) + + self.assert_qmp(result, 'return', {}) + +- self.complete_and_wait('mirror-job') ++ self.vm.run_job('mirror-job', use_log=False, auto_finalize=False, ++ pre_finalize=self.openBacking, auto_dismiss=True) + + def testFull(self): + self.runMirror('full') +@@ -221,6 +235,38 @@ class TestBlockdevMirrorForcedBacking(MirrorBaseClass): + target_blockdev_backing = { 'driver': 'null-co' } + target_real_backing = 'null-co://' + ++# Attach the backing chain only during 
completion, with blockdev-reopen ++class TestBlockdevMirrorReopen(MirrorBaseClass): ++ cmd = 'blockdev-mirror' ++ existing = True ++ target_backing = 'null-co://' ++ target_open_with_backing = False ++ ++ def openBacking(self): ++ if not self.target_open_with_backing: ++ result = self.vm.qmp('blockdev-add', node_name="backing", ++ driver="null-co") ++ self.assert_qmp(result, 'return', {}) ++ result = self.vm.qmp('x-blockdev-reopen', node_name="target", ++ driver=iotests.imgfmt, file="target-file", ++ backing="backing") ++ self.assert_qmp(result, 'return', {}) ++ ++# Attach the backing chain only during completion, with blockdev-snapshot ++class TestBlockdevMirrorSnapshot(MirrorBaseClass): ++ cmd = 'blockdev-mirror' ++ existing = True ++ target_backing = 'null-co://' ++ target_open_with_backing = False ++ ++ def openBacking(self): ++ if not self.target_open_with_backing: ++ result = self.vm.qmp('blockdev-add', node_name="backing", ++ driver="null-co") ++ self.assert_qmp(result, 'return', {}) ++ result = self.vm.qmp('blockdev-snapshot', node="backing", ++ overlay="target") ++ self.assert_qmp(result, 'return', {}) + + class TestCommit(BaseClass): + existing = False +diff --git a/tests/qemu-iotests/155.out b/tests/qemu-iotests/155.out +index 4176bb9..4fd1c2d 100644 +--- a/tests/qemu-iotests/155.out ++++ b/tests/qemu-iotests/155.out +@@ -1,5 +1,5 @@ +-................... ++......................... 
+ ---------------------------------------------------------------------- +-Ran 19 tests ++Ran 25 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-Use-complete_and_wait-in-155.patch b/kvm-iotests-Use-complete_and_wait-in-155.patch new file mode 100755 index 0000000..38b41be --- /dev/null +++ b/kvm-iotests-Use-complete_and_wait-in-155.patch @@ -0,0 +1,50 @@ +From 872fbd32d06bda4aba3a7e67a95f76f62e475dbe Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:27 +0000 +Subject: [PATCH 07/20] iotests: Use complete_and_wait() in 155 + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-2-kwolf@redhat.com> +Patchwork-id: 94279 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 01/13] iotests: Use complete_and_wait() in 155 +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +From: Max Reitz + +This way, we get to see errors during the completion phase. + +Signed-off-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200218103454.296704-14-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 6644d0e6192b36cdf2902c9774e1afb8ab2e7223) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/155 | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155 +index e194859..d7ef257 100755 +--- a/tests/qemu-iotests/155 ++++ b/tests/qemu-iotests/155 +@@ -163,12 +163,7 @@ class MirrorBaseClass(BaseClass): + + self.assert_qmp(result, 'return', {}) + +- self.vm.event_wait('BLOCK_JOB_READY') +- +- result = self.vm.qmp('block-job-complete', device='mirror-job') +- self.assert_qmp(result, 'return', {}) +- +- self.vm.event_wait('BLOCK_JOB_COMPLETED') ++ self.complete_and_wait('mirror-job') + + def testFull(self): + self.runMirror('full') +-- +1.8.3.1 + diff --git a/kvm-iotests-don-t-use-format-for-drive_add.patch b/kvm-iotests-don-t-use-format-for-drive_add.patch new file mode 100755 index 0000000..f95e17a --- /dev/null +++ b/kvm-iotests-don-t-use-format-for-drive_add.patch @@ -0,0 +1,81 @@ +From 127360c2fa0fefa18ff828bfec3985e04791d665 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:16 +0100 +Subject: [PATCH 17/26] iotests: don't use 'format' for drive_add +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-3-kwolf@redhat.com> +Patchwork-id: 97102 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 02/11] iotests: don't use 'format' for drive_add +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +From: John Snow + +It shadows (with a different type) the built-in format. +Use something else. + +Signed-off-by: John Snow +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Max Reitz +Message-Id: <20200331000014.11581-3-jsnow@redhat.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Max Reitz +(cherry picked from commit 1d3d4b630c6ea8b19420c097f0c448b6ded95072) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/055 | 3 ++- + tests/qemu-iotests/iotests.py | 6 +++--- + 2 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055 +index c732a11..eb50c9f 100755 +--- a/tests/qemu-iotests/055 ++++ b/tests/qemu-iotests/055 +@@ -469,7 +469,8 @@ class TestDriveCompression(iotests.QMPTestCase): + qemu_img('create', '-f', fmt, blockdev_target_img, + str(TestDriveCompression.image_len), *args) + if attach_target: +- self.vm.add_drive(blockdev_target_img, format=fmt, interface="none") ++ self.vm.add_drive(blockdev_target_img, ++ img_format=fmt, interface="none") + + self.vm.launch() + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 46f880c..be20d56 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -481,20 +481,20 @@ class VM(qtest.QEMUQtestMachine): + self._args.append(opts) + return self + +- def add_drive(self, path, opts='', interface='virtio', format=imgfmt): ++ def add_drive(self, path, opts='', interface='virtio', img_format=imgfmt): + '''Add a virtio-blk drive to the VM''' + options = ['if=%s' % interface, + 'id=drive%d' % self._num_drives] + + if path is not None: + options.append('file=%s' % path) +- options.append('format=%s' % format) ++ options.append('format=%s' % img_format) + options.append('cache=%s' % cachemode) + + if opts: + options.append(opts) + +- if format == 'luks' and 'key-secret' not in opts: ++ if img_format == 'luks' and 'key-secret' not in opts: + # default luks support + if luks_default_secret_object not in self._args: + self.add_object(luks_default_secret_object) +-- +1.8.3.1 + diff --git a/kvm-iotests.py-Let-wait_migration-wait-even-more.patch b/kvm-iotests.py-Let-wait_migration-wait-even-more.patch new file mode 100755 index 0000000..cda8037 --- /dev/null +++ b/kvm-iotests.py-Let-wait_migration-wait-even-more.patch @@ -0,0 +1,123 @@ +From d6df1426ae65b3a0d50bdbb1f8a7246386dd6ebf Mon Sep 17 00:00:00 
2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:04 +0000 +Subject: [PATCH 07/18] iotests.py: Let wait_migration wait even more + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-7-kwolf@redhat.com> +Patchwork-id: 93751 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 6/6] iotests.py: Let wait_migration wait even more +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +From: Max Reitz + +The "migration completed" event may be sent (on the source, to be +specific) before the migration is actually completed, so the VM runstate +will still be "finish-migrate" instead of "postmigrate". So ask the +users of VM.wait_migration() to specify the final runstate they desire +and then poll the VM until it has reached that state. (This should be +over very quickly, so busy polling is fine.) + +Without this patch, I see intermittent failures in the new iotest 280 +under high system load. I have not yet seen such failures with other +iotests that use VM.wait_migration() and query-status afterwards, but +maybe they just occur even more rarely, or it is because they also wait +on the destination VM to be running. + +Signed-off-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 8da7969bd7014f6de037d8ae132b40721944b186) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/qemu-iotests/234 | 8 ++++---- + tests/qemu-iotests/262 | 4 ++-- + tests/qemu-iotests/280 | 2 +- + tests/qemu-iotests/iotests.py | 6 +++++- + 4 files changed, 12 insertions(+), 8 deletions(-) + +diff --git a/tests/qemu-iotests/234 b/tests/qemu-iotests/234 +index 34c818c..59a7f94 100755 +--- a/tests/qemu-iotests/234 ++++ b/tests/qemu-iotests/234 +@@ -69,9 +69,9 @@ with iotests.FilePath('img') as img_path, \ + iotests.log(vm_a.qmp('migrate', uri='exec:cat >%s' % (fifo_a))) + with iotests.Timeout(3, 'Migration does not complete'): + # Wait for the source first (which includes setup=setup) +- vm_a.wait_migration() ++ vm_a.wait_migration('postmigrate') + # Wait for the destination second (which does not) +- vm_b.wait_migration() ++ vm_b.wait_migration('running') + + iotests.log(vm_a.qmp('query-migrate')['return']['status']) + iotests.log(vm_b.qmp('query-migrate')['return']['status']) +@@ -98,9 +98,9 @@ with iotests.FilePath('img') as img_path, \ + iotests.log(vm_b.qmp('migrate', uri='exec:cat >%s' % (fifo_b))) + with iotests.Timeout(3, 'Migration does not complete'): + # Wait for the source first (which includes setup=setup) +- vm_b.wait_migration() ++ vm_b.wait_migration('postmigrate') + # Wait for the destination second (which does not) +- vm_a.wait_migration() ++ vm_a.wait_migration('running') + + iotests.log(vm_a.qmp('query-migrate')['return']['status']) + iotests.log(vm_b.qmp('query-migrate')['return']['status']) +diff --git a/tests/qemu-iotests/262 b/tests/qemu-iotests/262 +index 0963daa..bbcb526 100755 +--- a/tests/qemu-iotests/262 ++++ b/tests/qemu-iotests/262 +@@ -71,9 +71,9 @@ with iotests.FilePath('img') as img_path, \ + iotests.log(vm_a.qmp('migrate', uri='exec:cat >%s' % (fifo))) + with iotests.Timeout(3, 'Migration does not complete'): + # Wait for the source first (which includes setup=setup) +- vm_a.wait_migration() ++ vm_a.wait_migration('postmigrate') + # Wait for the destination second (which does not) +- vm_b.wait_migration() ++ 
vm_b.wait_migration('running') + + iotests.log(vm_a.qmp('query-migrate')['return']['status']) + iotests.log(vm_b.qmp('query-migrate')['return']['status']) +diff --git a/tests/qemu-iotests/280 b/tests/qemu-iotests/280 +index 0b1fa8e..85e9114 100755 +--- a/tests/qemu-iotests/280 ++++ b/tests/qemu-iotests/280 +@@ -45,7 +45,7 @@ with iotests.FilePath('base') as base_path , \ + vm.qmp_log('migrate', uri='exec:cat > /dev/null') + + with iotests.Timeout(3, 'Migration does not complete'): +- vm.wait_migration() ++ vm.wait_migration('postmigrate') + + iotests.log('\nVM is now stopped:') + iotests.log(vm.qmp('query-migrate')['return']['status']) +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 5741efb..0c55f7b 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -663,12 +663,16 @@ class VM(qtest.QEMUQtestMachine): + } + ])) + +- def wait_migration(self): ++ def wait_migration(self, expect_runstate): + while True: + event = self.event_wait('MIGRATION') + log(event, filters=[filter_qmp_event]) + if event['data']['status'] == 'completed': + break ++ # The event may occur in finish-migrate, so wait for the expected ++ # post-migration runstate ++ while self.qmp('query-status')['return']['status'] != expect_runstate: ++ pass + + def node_info(self, node_name): + nodes = self.qmp('query-named-block-nodes') +-- +1.8.3.1 + diff --git a/kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch b/kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch new file mode 100755 index 0000000..2ee9dcd --- /dev/null +++ b/kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch @@ -0,0 +1,79 @@ +From 1c508d56d154caf5fbf53e7dabafd707236cb16b Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Wed, 29 Jan 2020 13:45:18 +0000 +Subject: [PATCH 06/15] iscsi: Cap block count from GET LBA STATUS + (CVE-2020-1711) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: 
jmaloy +Message-id: <20200129134518.1293-2-jmaloy@redhat.com> +Patchwork-id: 93571 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] iscsi: Cap block count from GET LBA STATUS (CVE-2020-1711) +Bugzilla: 1794503 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf +RH-Acked-by: Philippe Mathieu-Daudé + +From: Felipe Franciosi + +When querying an iSCSI server for the provisioning status of blocks (via +GET LBA STATUS), Qemu only validates that the response descriptor zero's +LBA matches the one requested. Given the SCSI spec allows servers to +respond with the status of blocks beyond the end of the LUN, Qemu may +have its heap corrupted by clearing/setting too many bits at the end of +its allocmap for the LUN. + +A malicious guest in control of the iSCSI server could carefully program +Qemu's heap (by selectively setting the bitmap) and then smash it. + +This limits the number of bits that iscsi_co_block_status() will try to +update in the allocmap so it can't overflow the bitmap. + +Fixes: CVE-2020-1711 +Cc: qemu-stable@nongnu.org +Signed-off-by: Felipe Franciosi +Signed-off-by: Peter Turschmid +Signed-off-by: Raphael Norwitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 693fd2acdf14dd86c0bf852610f1c2cca80a74dc) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + block/iscsi.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/block/iscsi.c b/block/iscsi.c +index 2aea7e3..cbd5729 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -701,7 +701,7 @@ static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs, + struct scsi_get_lba_status *lbas = NULL; + struct scsi_lba_status_descriptor *lbasd = NULL; + struct IscsiTask iTask; +- uint64_t lba; ++ uint64_t lba, max_bytes; + int ret; + + iscsi_co_init_iscsitask(iscsilun, &iTask); +@@ -721,6 +721,7 @@ static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs, + } + + lba = offset / iscsilun->block_size; ++ max_bytes = (iscsilun->num_blocks - lba) * iscsilun->block_size; + + qemu_mutex_lock(&iscsilun->mutex); + retry: +@@ -764,7 +765,7 @@ retry: + goto out_unlock; + } + +- *pnum = (int64_t) lbasd->num_blocks * iscsilun->block_size; ++ *pnum = MIN((int64_t) lbasd->num_blocks * iscsilun->block_size, max_bytes); + + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { +-- +1.8.3.1 + diff --git a/kvm-iscsi-Drop-iscsi_co_create_opts.patch b/kvm-iscsi-Drop-iscsi_co_create_opts.patch new file mode 100755 index 0000000..a6d0baf --- /dev/null +++ b/kvm-iscsi-Drop-iscsi_co_create_opts.patch @@ -0,0 +1,113 @@ +From 58b7d33e1bc17b89103ceaa39f5722a69b35d810 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:45 +0000 +Subject: [PATCH 04/20] iscsi: Drop iscsi_co_create_opts() + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-5-mlevitsk@redhat.com> +Patchwork-id: 94226 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 4/6] iscsi: Drop iscsi_co_create_opts() +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +The generic fallback implementation effectively does the same. 
+ +Reviewed-by: Maxim Levitsky +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-5-mreitz@redhat.com> +Signed-off-by: Max Reitz +(cherry picked from commit 80f0900905b555f00d644894c786b6d66ac2e00e) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block/iscsi.c | 56 -------------------------------------------------------- + 1 file changed, 56 deletions(-) + +diff --git a/block/iscsi.c b/block/iscsi.c +index cbd5729..b45da65 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -2164,58 +2164,6 @@ static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset, + return 0; + } + +-static int coroutine_fn iscsi_co_create_opts(const char *filename, QemuOpts *opts, +- Error **errp) +-{ +- int ret = 0; +- int64_t total_size = 0; +- BlockDriverState *bs; +- IscsiLun *iscsilun = NULL; +- QDict *bs_options; +- Error *local_err = NULL; +- +- bs = bdrv_new(); +- +- /* Read out options */ +- total_size = DIV_ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +- BDRV_SECTOR_SIZE); +- bs->opaque = g_new0(struct IscsiLun, 1); +- iscsilun = bs->opaque; +- +- bs_options = qdict_new(); +- iscsi_parse_filename(filename, bs_options, &local_err); +- if (local_err) { +- error_propagate(errp, local_err); +- ret = -EINVAL; +- } else { +- ret = iscsi_open(bs, bs_options, 0, NULL); +- } +- qobject_unref(bs_options); +- +- if (ret != 0) { +- goto out; +- } +- iscsi_detach_aio_context(bs); +- if (iscsilun->type != TYPE_DISK) { +- ret = -ENODEV; +- goto out; +- } +- if (bs->total_sectors < total_size) { +- ret = -ENOSPC; +- goto out; +- } +- +- ret = 0; +-out: +- if (iscsilun->iscsi != NULL) { +- iscsi_destroy_context(iscsilun->iscsi); +- } +- g_free(bs->opaque); +- bs->opaque = NULL; +- bdrv_unref(bs); +- return ret; +-} +- + static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) + { + IscsiLun *iscsilun = bs->opaque; +@@ -2486,8 +2434,6 @@ static BlockDriver bdrv_iscsi = { + .bdrv_parse_filename = iscsi_parse_filename, 
+ .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, +- .bdrv_co_create_opts = iscsi_co_create_opts, +- .create_opts = &iscsi_create_opts, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_co_invalidate_cache = iscsi_co_invalidate_cache, +@@ -2525,8 +2471,6 @@ static BlockDriver bdrv_iser = { + .bdrv_parse_filename = iscsi_parse_filename, + .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, +- .bdrv_co_create_opts = iscsi_co_create_opts, +- .create_opts = &iscsi_create_opts, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_co_invalidate_cache = iscsi_co_invalidate_cache, +-- +1.8.3.1 + diff --git a/kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch b/kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch new file mode 100755 index 0000000..e38428b --- /dev/null +++ b/kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch @@ -0,0 +1,213 @@ +From 3f16b8a33bd7503cbe857fbeb45fff7301b6bb5f Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:12 +0100 +Subject: [PATCH 1/6] job: take each job's lock individually in job_txn_apply + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-2-kwolf@redhat.com> +Patchwork-id: 94597 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/6] job: take each job's lock individually in job_txn_apply +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +From: Stefan Reiter + +All callers of job_txn_apply hold a single job's lock, but different +jobs within a transaction can have different contexts, thus we need to +lock each one individually before applying the callback function. + +Similar to job_completed_txn_abort this also requires releasing the +caller's context before and reacquiring it after to avoid recursive +locks which might break AIO_WAIT_WHILE in the callback. 
This is safe, since +existing code would already have to take this into account, lest +job_completed_txn_abort might have broken. + +This also brings to light a different issue: When a callback function in +job_txn_apply moves it's job to a different AIO context, callers will +try to release the wrong lock (now that we re-acquire the lock +correctly, previously it would just continue with the old lock, leaving +the job unlocked for the rest of the return path). Fix this by not caching +the job's context. + +This is only necessary for qmp_block_job_finalize, qmp_job_finalize and +job_exit, since everyone else calls through job_exit. + +One test needed adapting, since it calls job_finalize directly, so it +manually needs to acquire the correct context. + +Signed-off-by: Stefan Reiter +Message-Id: <20200407115651.69472-2-s.reiter@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit b660a84bbb0eb1a76b505648d31d5e82594fb75e) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 9 +++++++++ + job-qmp.c | 9 +++++++++ + job.c | 50 ++++++++++++++++++++++++++++++++++++++++---------- + tests/test-blockjob.c | 2 ++ + 4 files changed, 60 insertions(+), 10 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index c8d4b51..86eb115 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -4215,7 +4215,16 @@ void qmp_block_job_finalize(const char *id, Error **errp) + } + + trace_qmp_block_job_finalize(job); ++ job_ref(&job->job); + job_finalize(&job->job, errp); ++ ++ /* ++ * Job's context might have changed via job_finalize (and job_txn_apply ++ * automatically acquires the new one), so make sure we release the correct ++ * one. 
++ */ ++ aio_context = blk_get_aio_context(job->blk); ++ job_unref(&job->job); + aio_context_release(aio_context); + } + +diff --git a/job-qmp.c b/job-qmp.c +index fbfed25..a201220 100644 +--- a/job-qmp.c ++++ b/job-qmp.c +@@ -114,7 +114,16 @@ void qmp_job_finalize(const char *id, Error **errp) + } + + trace_qmp_job_finalize(job); ++ job_ref(job); + job_finalize(job, errp); ++ ++ /* ++ * Job's context might have changed via job_finalize (and job_txn_apply ++ * automatically acquires the new one), so make sure we release the correct ++ * one. ++ */ ++ aio_context = job->aio_context; ++ job_unref(job); + aio_context_release(aio_context); + } + +diff --git a/job.c b/job.c +index 04409b4..48fc4ad 100644 +--- a/job.c ++++ b/job.c +@@ -136,17 +136,38 @@ static void job_txn_del_job(Job *job) + } + } + +-static int job_txn_apply(JobTxn *txn, int fn(Job *)) ++static int job_txn_apply(Job *job, int fn(Job *)) + { +- Job *job, *next; ++ AioContext *inner_ctx; ++ Job *other_job, *next; ++ JobTxn *txn = job->txn; + int rc = 0; + +- QLIST_FOREACH_SAFE(job, &txn->jobs, txn_list, next) { +- rc = fn(job); ++ /* ++ * Similar to job_completed_txn_abort, we take each job's lock before ++ * applying fn, but since we assume that outer_ctx is held by the caller, ++ * we need to release it here to avoid holding the lock twice - which would ++ * break AIO_WAIT_WHILE from within fn. ++ */ ++ job_ref(job); ++ aio_context_release(job->aio_context); ++ ++ QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) { ++ inner_ctx = other_job->aio_context; ++ aio_context_acquire(inner_ctx); ++ rc = fn(other_job); ++ aio_context_release(inner_ctx); + if (rc) { + break; + } + } ++ ++ /* ++ * Note that job->aio_context might have been changed by calling fn, so we ++ * can't use a local variable to cache it. 
++ */ ++ aio_context_acquire(job->aio_context); ++ job_unref(job); + return rc; + } + +@@ -774,11 +795,11 @@ static void job_do_finalize(Job *job) + assert(job && job->txn); + + /* prepare the transaction to complete */ +- rc = job_txn_apply(job->txn, job_prepare); ++ rc = job_txn_apply(job, job_prepare); + if (rc) { + job_completed_txn_abort(job); + } else { +- job_txn_apply(job->txn, job_finalize_single); ++ job_txn_apply(job, job_finalize_single); + } + } + +@@ -824,10 +845,10 @@ static void job_completed_txn_success(Job *job) + assert(other_job->ret == 0); + } + +- job_txn_apply(txn, job_transition_to_pending); ++ job_txn_apply(job, job_transition_to_pending); + + /* If no jobs need manual finalization, automatically do so */ +- if (job_txn_apply(txn, job_needs_finalize) == 0) { ++ if (job_txn_apply(job, job_needs_finalize) == 0) { + job_do_finalize(job); + } + } +@@ -849,9 +870,10 @@ static void job_completed(Job *job) + static void job_exit(void *opaque) + { + Job *job = (Job *)opaque; +- AioContext *ctx = job->aio_context; ++ AioContext *ctx; + +- aio_context_acquire(ctx); ++ job_ref(job); ++ aio_context_acquire(job->aio_context); + + /* This is a lie, we're not quiescent, but still doing the completion + * callbacks. However, completion callbacks tend to involve operations that +@@ -862,6 +884,14 @@ static void job_exit(void *opaque) + + job_completed(job); + ++ /* ++ * Note that calling job_completed can move the job to a different ++ * aio_context, so we cannot cache from above. job_txn_apply takes care of ++ * acquiring the new lock, and we ref/unref to avoid job_completed freeing ++ * the job underneath us. 
++ */ ++ ctx = job->aio_context; ++ job_unref(job); + aio_context_release(ctx); + } + +diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c +index 7844c9f..6d857fd 100644 +--- a/tests/test-blockjob.c ++++ b/tests/test-blockjob.c +@@ -368,7 +368,9 @@ static void test_cancel_concluded(void) + aio_poll(qemu_get_aio_context(), true); + assert(job->status == JOB_STATUS_PENDING); + ++ aio_context_acquire(job->aio_context); + job_finalize(job, &error_abort); ++ aio_context_release(job->aio_context); + assert(job->status == JOB_STATUS_CONCLUDED); + + cancel_common(s); +-- +1.8.3.1 + diff --git a/kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch b/kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch new file mode 100755 index 0000000..902af6c --- /dev/null +++ b/kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch @@ -0,0 +1,53 @@ +From e2cafb929acb74377754cb688419575b139b922a Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:47 -0400 +Subject: [PATCH 9/9] lan9118: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-10-jmaloy@redhat.com> +Patchwork-id: 101790 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 9/9] lan9118: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Alexander Bulekov + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 37cee01784ff0df13e5209517e1b3594a5e792d1) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/lan9118.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/lan9118.c b/hw/net/lan9118.c +index ed551f2178..7bb4633f0f 100644 +--- a/hw/net/lan9118.c ++++ b/hw/net/lan9118.c +@@ -667,7 +667,7 @@ static void do_tx_packet(lan9118_state *s) + /* FIXME: Honor TX disable, and allow queueing of packets. */ + if (s->phy_control & 0x4000) { + /* This assumes the receive routine doesn't touch the VLANClient. */ +- lan9118_receive(qemu_get_queue(s->nic), s->txp->data, s->txp->len); ++ qemu_receive_packet(qemu_get_queue(s->nic), s->txp->data, s->txp->len); + } else { + qemu_send_packet(qemu_get_queue(s->nic), s->txp->data, s->txp->len); + } +-- +2.27.0 + diff --git a/kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch b/kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch new file mode 100755 index 0000000..71a2eac --- /dev/null +++ b/kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch @@ -0,0 +1,47 @@ +From 2687e0348e3e4d377b4f5356e46948dc2b371b6d Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:02 -0400 +Subject: [PATCH 3/7] libqos: pci-pc: use 32-bit write for EJ register +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-3-jmaloy@redhat.com> +Patchwork-id: 101484 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 2/6] libqos: pci-pc: use 32-bit write for EJ register +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Paolo Bonzini + +The memory region ops have min_access_size == 4 so obey it. + +Tested-by: Thomas Huth +Signed-off-by: Paolo Bonzini + +(cherry picked from commit 4b7c06837ae0b1ff56473202a42e7e386f53d6db) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/libqos/pci-pc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tests/libqos/pci-pc.c b/tests/libqos/pci-pc.c +index 0bc591d1da..3bb2eb3ba8 100644 +--- a/tests/libqos/pci-pc.c ++++ b/tests/libqos/pci-pc.c +@@ -186,7 +186,7 @@ void qpci_unplug_acpi_device_test(QTestState *qts, const char *id, uint8_t slot) + g_assert(!qdict_haskey(response, "error")); + qobject_unref(response); + +- qtest_outb(qts, ACPI_PCIHP_ADDR + PCI_EJ_BASE, 1 << slot); ++ qtest_outl(qts, ACPI_PCIHP_ADDR + PCI_EJ_BASE, 1 << slot); + + qtest_qmp_eventwait(qts, "DEVICE_DELETED"); + } +-- +2.27.0 + diff --git a/kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch b/kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch new file mode 100755 index 0000000..424a60c --- /dev/null +++ b/kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch @@ -0,0 +1,48 @@ +From 6320b4e76965b1cf64da4307f4d313fe6b2aa971 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:01 -0400 +Subject: [PATCH 2/7] libqos: usb-hcd-ehci: use 32-bit write for config + register +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-2-jmaloy@redhat.com> +Patchwork-id: 101478 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 1/6] libqos: usb-hcd-ehci: use 32-bit write for config register +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Paolo Bonzini + +The memory region ops have min_access_size == 4 so obey it. + +Tested-by: Thomas Huth +Signed-off-by: Paolo Bonzini + +(cherry picked from commit 89ed83d8b23c11d250c290593cad3ca839d5b053) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + tests/usb-hcd-ehci-test.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tests/usb-hcd-ehci-test.c b/tests/usb-hcd-ehci-test.c +index 5251d539e9..c51e8bb223 100644 +--- a/tests/usb-hcd-ehci-test.c ++++ b/tests/usb-hcd-ehci-test.c +@@ -96,7 +96,7 @@ static void pci_ehci_port_1(void) + static void pci_ehci_config(void) + { + /* hands over all ports from companion uhci to ehci */ +- qpci_io_writew(ehci1.dev, ehci1.bar, 0x60, 1); ++ qpci_io_writel(ehci1.dev, ehci1.bar, 0x60, 1); + } + + static void pci_uhci_port_2(void) +-- +2.27.0 + diff --git a/kvm-libvhost-user-Fix-some-memtable-remap-cases.patch b/kvm-libvhost-user-Fix-some-memtable-remap-cases.patch new file mode 100755 index 0000000..e362efe --- /dev/null +++ b/kvm-libvhost-user-Fix-some-memtable-remap-cases.patch @@ -0,0 +1,117 @@ +From ee360b70f179cf540faebe7e55b34e323e2bb179 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:09 +0100 +Subject: [PATCH 098/116] libvhost-user: Fix some memtable remap cases +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-95-dgilbert@redhat.com> +Patchwork-id: 93548 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 094/112] libvhost-user: Fix some memtable remap cases +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +If a new setmemtable command comes in once the vhost threads are +running, it will remap the guests address space and the threads +will now be looking in the wrong place. + +Fortunately we're running this command under lock, so we can +update the queue mappings so that threads will look in the new-right +place. + +Note: This doesn't fix things that the threads might be doing +without a lock (e.g. a readv/writev!) That's for another time. + +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 49e9ec749d4db62ae51f76354143cee183912a1d) +Signed-off-by: Miroslav Rezanina +--- + contrib/libvhost-user/libvhost-user.c | 33 +++++++++++++++++++++++++-------- + contrib/libvhost-user/libvhost-user.h | 3 +++ + 2 files changed, 28 insertions(+), 8 deletions(-) + +diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c +index 63e4106..b89bf18 100644 +--- a/contrib/libvhost-user/libvhost-user.c ++++ b/contrib/libvhost-user/libvhost-user.c +@@ -565,6 +565,21 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) + } + + static bool ++map_ring(VuDev *dev, VuVirtq *vq) ++{ ++ vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); ++ vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); ++ vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); ++ ++ DPRINT("Setting virtq addresses:\n"); ++ DPRINT(" vring_desc at %p\n", vq->vring.desc); ++ DPRINT(" vring_used at %p\n", vq->vring.used); ++ DPRINT(" vring_avail at %p\n", vq->vring.avail); ++ ++ return !(vq->vring.desc && vq->vring.used && vq->vring.avail); ++} ++ ++static bool + vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) + { + int i; +@@ -767,6 +782,14 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) + close(vmsg->fds[i]); + } + ++ for (i = 0; i < dev->max_queues; i++) { ++ if (dev->vq[i].vring.desc) { ++ if (map_ring(dev, &dev->vq[i])) { ++ vu_panic(dev, "remaping queue %d during setmemtable", i); ++ } ++ } ++ } ++ + return false; + } + +@@ -853,18 +876,12 @@ vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) + DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", vra->avail_user_addr); + DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", vra->log_guest_addr); + ++ vq->vra = *vra; + vq->vring.flags = vra->flags; +- vq->vring.desc = qva_to_va(dev, vra->desc_user_addr); +- vq->vring.used = qva_to_va(dev, vra->used_user_addr); +- vq->vring.avail = qva_to_va(dev, vra->avail_user_addr); + vq->vring.log_guest_addr = 
vra->log_guest_addr; + +- DPRINT("Setting virtq addresses:\n"); +- DPRINT(" vring_desc at %p\n", vq->vring.desc); +- DPRINT(" vring_used at %p\n", vq->vring.used); +- DPRINT(" vring_avail at %p\n", vq->vring.avail); + +- if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) { ++ if (map_ring(dev, vq)) { + vu_panic(dev, "Invalid vring_addr message"); + return false; + } +diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h +index 1844b6f..5cb7708 100644 +--- a/contrib/libvhost-user/libvhost-user.h ++++ b/contrib/libvhost-user/libvhost-user.h +@@ -327,6 +327,9 @@ typedef struct VuVirtq { + int err_fd; + unsigned int enable; + bool started; ++ ++ /* Guest addresses of our ring */ ++ struct vhost_vring_addr vra; + } VuVirtq; + + enum VuWatchCondtion { +-- +1.8.3.1 + diff --git a/kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch b/kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch new file mode 100755 index 0000000..0e55df4 --- /dev/null +++ b/kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch @@ -0,0 +1,290 @@ +From cadb72854b44f53c07ea60d7a6149ccac5928a82 Mon Sep 17 00:00:00 2001 +From: Claudio Imbrenda +Date: Tue, 27 Oct 2020 12:02:15 -0400 +Subject: [PATCH 02/18] libvhost-user: handle endianness as mandated by the + spec + +RH-Author: Claudio Imbrenda +Message-id: <20201027120217.2997314-2-cimbrend@redhat.com> +Patchwork-id: 98723 +O-Subject: [RHEL8.4 qemu-kvm PATCH 1/3] libvhost-user: handle endianness as mandated by the spec +Bugzilla: 1857733 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Dr. 
David Alan Gilbert +RH-Acked-by: Cornelia Huck + +From: Marc Hartmayer + +Since virtio existed even before it got standardized, the virtio +standard defines the following types of virtio devices: + + + legacy device (pre-virtio 1.0) + + non-legacy or VIRTIO 1.0 device + + transitional device (which can act both as legacy and non-legacy) + +Virtio 1.0 defines the fields of the virtqueues as little endian, +while legacy uses guest's native endian [1]. Currently libvhost-user +does not handle virtio endianness at all, i.e. it works only if the +native endianness matches with whatever is actually needed. That means +things break spectacularly on big-endian targets. Let us handle virtio +endianness for non-legacy as required by the virtio specification [1] +and fence legacy virtio, as there is no safe way to figure out the +needed endianness conversions for all cases. The fencing of legacy +virtio devices is done in `vu_set_features_exec`. + +[1] https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-210003 + +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Marc Hartmayer +Message-id: 20200901150019.29229-3-mhartmay@linux.ibm.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 2ffc54708087c6e524297957be2fc5d543abb767) +Signed-off-by: Danilo C. L. 
de Paula +--- + contrib/libvhost-user/libvhost-user.c | 77 +++++++++++++++------------ + 1 file changed, 43 insertions(+), 34 deletions(-) + +diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c +index b89bf185013..b8350b067e3 100644 +--- a/contrib/libvhost-user/libvhost-user.c ++++ b/contrib/libvhost-user/libvhost-user.c +@@ -42,6 +42,7 @@ + + #include "qemu/atomic.h" + #include "qemu/osdep.h" ++#include "qemu/bswap.h" + #include "qemu/memfd.h" + + #include "libvhost-user.h" +@@ -522,6 +523,14 @@ vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg) + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + dev->features = vmsg->payload.u64; ++ if (!vu_has_feature(dev, VIRTIO_F_VERSION_1)) { ++ /* ++ * We only support devices conforming to VIRTIO 1.0 or ++ * later ++ */ ++ vu_panic(dev, "virtio legacy devices aren't supported by libvhost-user"); ++ return false; ++ } + + if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) { + vu_set_enable_all_rings(dev, true); +@@ -886,7 +895,7 @@ vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) + return false; + } + +- vq->used_idx = vq->vring.used->idx; ++ vq->used_idx = lduw_le_p(&vq->vring.used->idx); + + if (vq->last_avail_idx != vq->used_idx) { + bool resume = dev->iface->queue_is_processed_in_order && +@@ -998,7 +1007,7 @@ vu_check_queue_inflights(VuDev *dev, VuVirtq *vq) + return 0; + } + +- vq->used_idx = vq->vring.used->idx; ++ vq->used_idx = lduw_le_p(&vq->vring.used->idx); + vq->resubmit_num = 0; + vq->resubmit_list = NULL; + vq->counter = 0; +@@ -1737,13 +1746,13 @@ vu_queue_started(const VuDev *dev, const VuVirtq *vq) + static inline uint16_t + vring_avail_flags(VuVirtq *vq) + { +- return vq->vring.avail->flags; ++ return lduw_le_p(&vq->vring.avail->flags); + } + + static inline uint16_t + vring_avail_idx(VuVirtq *vq) + { +- vq->shadow_avail_idx = vq->vring.avail->idx; ++ vq->shadow_avail_idx = lduw_le_p(&vq->vring.avail->idx); + + return vq->shadow_avail_idx; + } +@@ 
-1751,7 +1760,7 @@ vring_avail_idx(VuVirtq *vq) + static inline uint16_t + vring_avail_ring(VuVirtq *vq, int i) + { +- return vq->vring.avail->ring[i]; ++ return lduw_le_p(&vq->vring.avail->ring[i]); + } + + static inline uint16_t +@@ -1839,12 +1848,12 @@ virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) + { + /* If this descriptor says it doesn't chain, we're done. */ +- if (!(desc[i].flags & VRING_DESC_F_NEXT)) { ++ if (!(lduw_le_p(&desc[i].flags) & VRING_DESC_F_NEXT)) { + return VIRTQUEUE_READ_DESC_DONE; + } + + /* Check they're not leading us off end of descriptors. */ +- *next = desc[i].next; ++ *next = lduw_le_p(&desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + +@@ -1887,8 +1896,8 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, + } + desc = vq->vring.desc; + +- if (desc[i].flags & VRING_DESC_F_INDIRECT) { +- if (desc[i].len % sizeof(struct vring_desc)) { ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_INDIRECT) { ++ if (ldl_le_p(&desc[i].len) % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + goto err; + } +@@ -1901,8 +1910,8 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, + + /* loop over the indirect descriptor table */ + indirect = 1; +- desc_addr = desc[i].addr; +- desc_len = desc[i].len; ++ desc_addr = ldq_le_p(&desc[i].addr); ++ desc_len = ldl_le_p(&desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); +@@ -1929,10 +1938,10 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, + goto err; + } + +- if (desc[i].flags & VRING_DESC_F_WRITE) { +- in_total += desc[i].len; ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_WRITE) { ++ in_total += ldl_le_p(&desc[i].len); + } else { +- out_total += desc[i].len; ++ out_total += ldl_le_p(&desc[i].len); + } + 
if (in_total >= max_in_bytes && out_total >= max_out_bytes) { + goto done; +@@ -2047,7 +2056,7 @@ vring_used_flags_set_bit(VuVirtq *vq, int mask) + + flags = (uint16_t *)((char*)vq->vring.used + + offsetof(struct vring_used, flags)); +- *flags |= mask; ++ stw_le_p(flags, lduw_le_p(flags) | mask); + } + + static inline void +@@ -2057,7 +2066,7 @@ vring_used_flags_unset_bit(VuVirtq *vq, int mask) + + flags = (uint16_t *)((char*)vq->vring.used + + offsetof(struct vring_used, flags)); +- *flags &= ~mask; ++ stw_le_p(flags, lduw_le_p(flags) & ~mask); + } + + static inline void +@@ -2067,7 +2076,7 @@ vring_set_avail_event(VuVirtq *vq, uint16_t val) + return; + } + +- *((uint16_t *) &vq->vring.used->ring[vq->vring.num]) = val; ++ stw_le_p(&vq->vring.used->ring[vq->vring.num], val); + } + + void +@@ -2156,14 +2165,14 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz) + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + int rc; + +- if (desc[i].flags & VRING_DESC_F_INDIRECT) { +- if (desc[i].len % sizeof(struct vring_desc)) { ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_INDIRECT) { ++ if (ldl_le_p(&desc[i].len) % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + } + + /* loop over the indirect descriptor table */ +- desc_addr = desc[i].addr; +- desc_len = desc[i].len; ++ desc_addr = ldq_le_p(&desc[i].addr); ++ desc_len = ldl_le_p(&desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); +@@ -2185,10 +2194,10 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz) + + /* Collect all the descriptors */ + do { +- if (desc[i].flags & VRING_DESC_F_WRITE) { ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_WRITE) { + virtqueue_map_desc(dev, &in_num, iov + out_num, + VIRTQUEUE_MAX_SIZE - out_num, true, +- desc[i].addr, desc[i].len); ++ ldq_le_p(&desc[i].addr), ldl_le_p(&desc[i].len)); + } else { + if (in_num) { + 
vu_panic(dev, "Incorrect order for descriptors"); +@@ -2196,7 +2205,7 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz) + } + virtqueue_map_desc(dev, &out_num, iov, + VIRTQUEUE_MAX_SIZE, false, +- desc[i].addr, desc[i].len); ++ ldq_le_p(&desc[i].addr), ldl_le_p(&desc[i].len)); + } + + /* If we've got too many, that implies a descriptor loop. */ +@@ -2392,14 +2401,14 @@ vu_log_queue_fill(VuDev *dev, VuVirtq *vq, + max = vq->vring.num; + i = elem->index; + +- if (desc[i].flags & VRING_DESC_F_INDIRECT) { +- if (desc[i].len % sizeof(struct vring_desc)) { ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_INDIRECT) { ++ if (ldl_le_p(&desc[i].len) % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + } + + /* loop over the indirect descriptor table */ +- desc_addr = desc[i].addr; +- desc_len = desc[i].len; ++ desc_addr = ldq_le_p(&desc[i].addr); ++ desc_len = ldl_le_p(&desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); +@@ -2425,9 +2434,9 @@ vu_log_queue_fill(VuDev *dev, VuVirtq *vq, + return; + } + +- if (desc[i].flags & VRING_DESC_F_WRITE) { +- min = MIN(desc[i].len, len); +- vu_log_write(dev, desc[i].addr, min); ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_WRITE) { ++ min = MIN(ldl_le_p(&desc[i].len), len); ++ vu_log_write(dev, ldq_le_p(&desc[i].addr), min); + len -= min; + } + +@@ -2452,15 +2461,15 @@ vu_queue_fill(VuDev *dev, VuVirtq *vq, + + idx = (idx + vq->used_idx) % vq->vring.num; + +- uelem.id = elem->index; +- uelem.len = len; ++ stl_le_p(&uelem.id, elem->index); ++ stl_le_p(&uelem.len, len); + vring_used_write(dev, vq, &uelem, idx); + } + + static inline + void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val) + { +- vq->vring.used->idx = val; ++ stw_le_p(&vq->vring.used->idx, val); + vu_log_write(dev, + vq->vring.log_guest_addr + offsetof(struct vring_used, idx), + sizeof(vq->vring.used->idx)); +-- +2.27.0 
+ diff --git a/kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch b/kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch new file mode 100755 index 0000000..d9c81cf --- /dev/null +++ b/kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch @@ -0,0 +1,43 @@ +From f844ca939adb619cce8426e104b0039a7eba70a6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 11 May 2021 11:24:04 -0400 +Subject: [PATCH 1/5] linux-headers: Add VFIO_CCW_REQ_IRQ_INDEX + +RH-Author: Thomas Huth +Message-id: <20210511112405.297037-2-thuth@redhat.com> +Patchwork-id: 101537 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] linux-headers: Add VFIO_CCW_REQ_IRQ_INDEX +Bugzilla: 1940450 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1940450 +Upstream-status: N/A + +This is based on upstream commit b3c818a47f ("Update linux headers to +5.11-rc2"), but has been reduced to the single hunk that is required +for the next patch (there were too many unrelated conflicts in the other +files for doing full backport of the original upstream commit). + +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + linux-headers/linux/vfio.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index f660bd7bac..9c8810bef4 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -580,6 +580,7 @@ enum { + enum { + VFIO_CCW_IO_IRQ_INDEX, + VFIO_CCW_CRW_IRQ_INDEX, ++ VFIO_CCW_REQ_IRQ_INDEX, + VFIO_CCW_NUM_IRQS + }; + +-- +2.27.0 + diff --git a/kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch b/kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch new file mode 100755 index 0000000..1217a6c --- /dev/null +++ b/kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch @@ -0,0 +1,83 @@ +From d9a63d12b5804eb172a040a16d7e725853c41a8c Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:12 -0500 +Subject: [PATCH 12/18] linux-headers: Partial update against Linux 5.9-rc4 + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-9-thuth@redhat.com> +Patchwork-id: 99505 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 08/12] linux-headers: Partial update against Linux 5.9-rc4 +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +Upstream-status: N/A + +This is based on upstream commit e6546342a830e520d14ef03aa95677611de0d90c +but only the two files have been included (there were too many conflicts +in the other unrelated files, so they have been dropped from this patch). + +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + linux-headers/asm-s390/kvm.h | 7 +++++-- + linux-headers/linux/kvm.h | 6 ++++++ + 2 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/linux-headers/asm-s390/kvm.h b/linux-headers/asm-s390/kvm.h +index 0138ccb0d89..f053b8304a8 100644 +--- a/linux-headers/asm-s390/kvm.h ++++ b/linux-headers/asm-s390/kvm.h +@@ -231,11 +231,13 @@ struct kvm_guest_debug_arch { + #define KVM_SYNC_GSCB (1UL << 9) + #define KVM_SYNC_BPBC (1UL << 10) + #define KVM_SYNC_ETOKEN (1UL << 11) ++#define KVM_SYNC_DIAG318 (1UL << 12) + + #define KVM_SYNC_S390_VALID_FIELDS \ + (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ + KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ +- KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) ++ KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \ ++ KVM_SYNC_DIAG318) + + /* length and alignment of the sdnx as a power of two */ + #define SDNXC 8 +@@ -264,7 +266,8 @@ struct kvm_sync_regs { + __u8 reserved2 : 7; + __u8 padding1[51]; /* riccb needs to be 64byte aligned */ + __u8 riccb[64]; /* runtime instrumentation controls block */ +- __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ ++ __u64 diag318; /* diagnose 0x318 info */ ++ __u8 padding2[184]; /* sdnx needs to be 256byte aligned */ + union { + __u8 sdnx[SDNXL]; /* state description annex */ + struct { +diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index 578cd97c0d9..6bba4ec136b 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -276,6 +276,7 @@ struct kvm_run { + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; ++ __u32 cpu; + } fail_entry; + /* KVM_EXIT_EXCEPTION */ + struct { +@@ -1011,6 +1012,11 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_S390_VCPU_RESETS 179 + #define KVM_CAP_S390_PROTECTED 180 + #define KVM_CAP_PPC_SECURE_GUEST 181 ++#define KVM_CAP_HALT_POLL 182 ++#define KVM_CAP_ASYNC_PF_INT 183 ++#define 
KVM_CAP_LAST_CPU 184 ++#define KVM_CAP_SMALLER_MAXPHYADDR 185 ++#define KVM_CAP_S390_DIAG318 186 + + #ifdef KVM_CAP_IRQ_ROUTING + +-- +2.27.0 + diff --git a/kvm-linux-headers-add-vfio-DMA-available-capability.patch b/kvm-linux-headers-add-vfio-DMA-available-capability.patch new file mode 100755 index 0000000..f62026d --- /dev/null +++ b/kvm-linux-headers-add-vfio-DMA-available-capability.patch @@ -0,0 +1,54 @@ +From b50c47e1a9fbe8876e231afbb5ed85945c8038da Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:40 -0500 +Subject: [PATCH 1/7] linux-headers: add vfio DMA available capability + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-2-cohuck@redhat.com> +Patchwork-id: 100674 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/7] linux-headers: add vfio DMA available capability +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +UPSTREAM: RHEL only + +This is the part of 53ba2eee52bf ("linux-headers: update against +5.10-rc1") required for DMA limiting. + +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + linux-headers/linux/vfio.h | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index 9e227348b30..f660bd7bace 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -751,6 +751,21 @@ struct vfio_iommu_type1_info_cap_iova_range { + struct vfio_iova_range iova_ranges[]; + }; + ++/* ++ * The DMA available capability allows to report the current number of ++ * simultaneously outstanding DMA mappings that are allowed. ++ * ++ * The structure below defines version 1 of this capability. ++ * ++ * avail: specifies the current number of outstanding DMA mappings allowed. 
++ */ ++#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3 ++ ++struct vfio_iommu_type1_info_dma_avail { ++ struct vfio_info_cap_header header; ++ __u32 avail; ++}; ++ + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + + /** +-- +2.27.0 + diff --git a/kvm-linux-headers-support-vfio-ccw-features.patch b/kvm-linux-headers-support-vfio-ccw-features.patch new file mode 100755 index 0000000..4eb95bf --- /dev/null +++ b/kvm-linux-headers-support-vfio-ccw-features.patch @@ -0,0 +1,77 @@ +From 1da0eecb9f2086c880fdaf1260ae775bbfbf5f02 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:37 -0400 +Subject: [PATCH 03/12] linux-headers: support vfio-ccw features + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-4-cohuck@redhat.com> +Patchwork-id: 97696 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 3/9] linux-headers: support vfio-ccw features +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +Partial update to support CRW and SCHIB regions. + +Upstream: n/a + +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. 
de Paula +--- + linux-headers/linux/vfio.h | 3 +++ + linux-headers/linux/vfio_ccw.h | 19 +++++++++++++++++++ + 2 files changed, 22 insertions(+) + +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index fb10370d29..9e227348b3 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -378,6 +378,8 @@ struct vfio_region_gfx_edid { + + /* sub-types for VFIO_REGION_TYPE_CCW */ + #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) ++#define VFIO_REGION_SUBTYPE_CCW_SCHIB (2) ++#define VFIO_REGION_SUBTYPE_CCW_CRW (3) + + /* + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped +@@ -577,6 +579,7 @@ enum { + + enum { + VFIO_CCW_IO_IRQ_INDEX, ++ VFIO_CCW_CRW_IRQ_INDEX, + VFIO_CCW_NUM_IRQS + }; + +diff --git a/linux-headers/linux/vfio_ccw.h b/linux-headers/linux/vfio_ccw.h +index fcc3e69ef5..6375d6ff25 100644 +--- a/linux-headers/linux/vfio_ccw.h ++++ b/linux-headers/linux/vfio_ccw.h +@@ -34,4 +34,23 @@ struct ccw_cmd_region { + __u32 ret_code; + } __attribute__((packed)); + ++/* ++ * Used for processing commands that read the subchannel-information block ++ * Reading this region triggers a stsch() to hardware ++ * Note: this is controlled by a capability ++ */ ++struct ccw_schib_region { ++#define SCHIB_AREA_SIZE 52 ++ __u8 schib_area[SCHIB_AREA_SIZE]; ++} __attribute__((packed)); ++ ++/* ++ * Used for returning a Channel Report Word to userspace. 
++ * Note: this is controlled by a capability ++ */ ++struct ccw_crw_region { ++ __u32 crw; ++ __u32 pad; ++} __attribute__((packed)); ++ + #endif +-- +2.27.0 + diff --git a/kvm-linux-headers-update-kvm.h.patch b/kvm-linux-headers-update-kvm.h.patch new file mode 100755 index 0000000..1834e33 --- /dev/null +++ b/kvm-linux-headers-update-kvm.h.patch @@ -0,0 +1,119 @@ +From 9d1b94d3739567245578f30866facc13edb3be92 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:44 -0400 +Subject: [PATCH 02/42] linux-headers: update kvm.h + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-3-thuth@redhat.com> +Patchwork-id: 97020 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 02/38] linux-headers: update kvm.h +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +Upstream-status: n/a + +Update kvm.h for the upcoming new s390x reset and protected virtualization +ioctls. This patch is based on commit ddda37483dd17c9936fdde9ebf8f6ca2692b3842 +and commit dc6f8d458a4ccc360723993f31d310d06469f55f, but I dropped all +(unrequired) changes to the other linux-header files. + +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + linux-headers/linux/kvm.h | 55 +++++++++++++++++++++++++++++++++++++-- + 1 file changed, 53 insertions(+), 2 deletions(-) + +diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index 3d9b18f7f8..578cd97c0d 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -468,12 +468,17 @@ struct kvm_s390_mem_op { + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ +- __u8 ar; /* the access register number */ +- __u8 reserved[31]; /* should be set to 0 */ ++ union { ++ __u8 ar; /* the access register number */ ++ __u32 sida_offset; /* offset into the sida */ ++ __u8 reserved[32]; /* should be set to 0 */ ++ }; + }; + /* types for kvm_s390_mem_op->op */ + #define KVM_S390_MEMOP_LOGICAL_READ 0 + #define KVM_S390_MEMOP_LOGICAL_WRITE 1 ++#define KVM_S390_MEMOP_SIDA_READ 2 ++#define KVM_S390_MEMOP_SIDA_WRITE 3 + /* flags for kvm_s390_mem_op->flags */ + #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) + #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +@@ -1000,6 +1005,12 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_PMU_EVENT_FILTER 173 + #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 + #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 ++#define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176 ++#define KVM_CAP_ARM_NISV_TO_USER 177 ++#define KVM_CAP_ARM_INJECT_EXT_DABT 178 ++#define KVM_CAP_S390_VCPU_RESETS 179 ++#define KVM_CAP_S390_PROTECTED 180 ++#define KVM_CAP_PPC_SECURE_GUEST 181 + + #ifdef KVM_CAP_IRQ_ROUTING + +@@ -1461,6 +1472,43 @@ struct kvm_enc_region { + /* Available with KVM_CAP_ARM_SVE */ + #define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int) + ++/* Available with KVM_CAP_S390_VCPU_RESETS */ ++#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) ++#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) ++ ++struct kvm_s390_pv_sec_parm { ++ __u64 origin; ++ __u64 length; ++}; ++ ++struct kvm_s390_pv_unp { ++ __u64 addr; ++ __u64 size; ++ __u64 tweak; ++}; ++ ++enum pv_cmd_id { ++ 
KVM_PV_ENABLE, ++ KVM_PV_DISABLE, ++ KVM_PV_SET_SEC_PARMS, ++ KVM_PV_UNPACK, ++ KVM_PV_VERIFY, ++ KVM_PV_PREP_RESET, ++ KVM_PV_UNSHARE_ALL, ++}; ++ ++struct kvm_pv_cmd { ++ __u32 cmd; /* Command to be executed */ ++ __u16 rc; /* Ultravisor return code */ ++ __u16 rrc; /* Ultravisor return reason code */ ++ __u64 data; /* Data or address */ ++ __u32 flags; /* flags for future extensions. Must be 0 for now */ ++ __u32 reserved[3]; ++}; ++ ++/* Available with KVM_CAP_S390_PROTECTED */ ++#define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) ++ + /* Secure Encrypted Virtualization command */ + enum sev_cmd_id { + /* Guest initialization commands */ +@@ -1611,4 +1659,7 @@ struct kvm_hyperv_eventfd { + #define KVM_HYPERV_CONN_ID_MASK 0x00ffffff + #define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + ++#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) ++#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) ++ + #endif /* __LINUX_KVM_H */ +-- +2.27.0 + diff --git a/kvm-memory-Add-IOMMUTLBEvent.patch b/kvm-memory-Add-IOMMUTLBEvent.patch new file mode 100755 index 0000000..5d73c97 --- /dev/null +++ b/kvm-memory-Add-IOMMUTLBEvent.patch @@ -0,0 +1,590 @@ +From 43a460bde62359c3fa2b1fc6c90d9e13ee7b9a6c Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:35 -0500 +Subject: [PATCH 11/17] memory: Add IOMMUTLBEvent +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-11-eperezma@redhat.com> +Patchwork-id: 100603 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 10/13] memory: Add IOMMUTLBEvent +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +This way we can tell between regular IOMMUTLBEntry (entry of IOMMU +hardware) and notifications. + +In the notifications, we set explicitly if it is a MAPs or an UNMAP, +instead of trusting in entry permissions to differentiate them. 
+ +Signed-off-by: Eugenio Pérez +Reviewed-by: Peter Xu +Reviewed-by: Juan Quintela +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-3-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Matthew Rosato +Acked-by: David Gibson +(cherry picked from commit 5039caf3c449c49e625d34e134463260cf8e00e0) + +Conflicts: + hw/s390x/s390-pci-inst.c: Context because of the lack of commit + ("37fa32de707 s390x/pci: Honor DMA limits set by vfio"). + hw/virtio/virtio-iommu.c: It does not exist in rhel. + +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 13 +++--- + hw/arm/smmuv3.c | 13 +++--- + hw/i386/intel_iommu.c | 88 ++++++++++++++++++++++------------------ + hw/misc/tz-mpc.c | 32 ++++++++------- + hw/ppc/spapr_iommu.c | 15 +++---- + hw/s390x/s390-pci-inst.c | 27 +++++++----- + include/exec/memory.h | 27 ++++++------ + memory.c | 20 ++++----- + 8 files changed, 127 insertions(+), 108 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index dfabe381182..a519c97614a 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -464,14 +464,15 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) + /* Unmap the whole notifier's range */ + static void smmu_unmap_notifier_range(IOMMUNotifier *n) + { +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + +- entry.target_as = &address_space_memory; +- entry.iova = n->start; +- entry.perm = IOMMU_NONE; +- entry.addr_mask = n->end - n->start; ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.target_as = &address_space_memory; ++ event.entry.iova = n->start; ++ event.entry.perm = IOMMU_NONE; ++ event.entry.addr_mask = n->end - n->start; + +- memory_region_notify_iommu_one(n, &entry); ++ memory_region_notify_iommu_one(n, &event); + } + + /* Unmap all notifiers attached to @mr */ +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index ef8a877c5d8..10b8393beeb 100644 +--- a/hw/arm/smmuv3.c ++++ 
b/hw/arm/smmuv3.c +@@ -783,7 +783,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + uint8_t tg, uint64_t num_pages) + { + SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + uint8_t granule = tg; + + if (!tg) { +@@ -806,12 +806,13 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + granule = tt->granule_sz; + } + +- entry.target_as = &address_space_memory; +- entry.iova = iova; +- entry.addr_mask = num_pages * (1 << granule) - 1; +- entry.perm = IOMMU_NONE; ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.target_as = &address_space_memory; ++ event.entry.iova = iova; ++ event.entry.addr_mask = num_pages * (1 << granule) - 1; ++ event.entry.perm = IOMMU_NONE; + +- memory_region_notify_iommu_one(n, &entry); ++ memory_region_notify_iommu_one(n, &event); + } + + /* invalidate an asid/iova range tuple in all mr's */ +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 463f107ad12..9fedbac82de 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -1016,7 +1016,7 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, + } + } + +-typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); ++typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private); + + /** + * Constant information used during page walking +@@ -1037,11 +1037,12 @@ typedef struct { + uint16_t domain_id; + } vtd_page_walk_info; + +-static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) ++static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) + { + VTDAddressSpace *as = info->as; + vtd_page_walk_hook hook_fn = info->hook_fn; + void *private = info->private; ++ IOMMUTLBEntry *entry = &event->entry; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, +@@ -1050,7 +1051,7 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) + }; + DMAMap *mapped = iova_tree_find(as->iova_tree, 
&target); + +- if (entry->perm == IOMMU_NONE && !info->notify_unmap) { ++ if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } +@@ -1058,7 +1059,7 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) + assert(hook_fn); + + /* Update local IOVA mapped ranges */ +- if (entry->perm) { ++ if (event->type == IOMMU_NOTIFIER_MAP) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { +@@ -1084,19 +1085,21 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) + int ret; + + /* Emulate an UNMAP */ ++ event->type = IOMMU_NOTIFIER_UNMAP; + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); +- ret = hook_fn(entry, private); ++ ret = hook_fn(event, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); +- /* Recover the correct permission */ ++ /* Recover the correct type */ ++ event->type = IOMMU_NOTIFIER_MAP; + entry->perm = cache_perm; + } + } +@@ -1113,7 +1116,7 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); +- return hook_fn(entry, private); ++ return hook_fn(event, private); + } + + /** +@@ -1134,7 +1137,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, + uint32_t offset; + uint64_t slpte; + uint64_t subpage_size, subpage_mask; +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + uint64_t iova = start; + uint64_t iova_next; + int ret = 0; +@@ -1188,13 +1191,15 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, + * + * In either case, we send an IOTLB notification down. 
+ */ +- entry.target_as = &address_space_memory; +- entry.iova = iova & subpage_mask; +- entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); +- entry.addr_mask = ~subpage_mask; ++ event.entry.target_as = &address_space_memory; ++ event.entry.iova = iova & subpage_mask; ++ event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); ++ event.entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ +- entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); +- ret = vtd_page_walk_one(&entry, info); ++ event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); ++ event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP : ++ IOMMU_NOTIFIER_UNMAP; ++ ret = vtd_page_walk_one(&event, info); + } + + if (ret < 0) { +@@ -1373,10 +1378,10 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, + return 0; + } + +-static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, ++static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event, + void *private) + { +- memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry); ++ memory_region_notify_iommu(private, 0, *event); + return 0; + } + +@@ -1936,14 +1941,17 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, + * page tables. We just deliver the PSI down to + * invalidate caches. 
+ */ +- IOMMUTLBEntry entry = { +- .target_as = &address_space_memory, +- .iova = addr, +- .translated_addr = 0, +- .addr_mask = size - 1, +- .perm = IOMMU_NONE, ++ IOMMUTLBEvent event = { ++ .type = IOMMU_NOTIFIER_UNMAP, ++ .entry = { ++ .target_as = &address_space_memory, ++ .iova = addr, ++ .translated_addr = 0, ++ .addr_mask = size - 1, ++ .perm = IOMMU_NONE, ++ }, + }; +- memory_region_notify_iommu(&vtd_as->iommu, 0, entry); ++ memory_region_notify_iommu(&vtd_as->iommu, 0, event); + } + } + } +@@ -2355,7 +2363,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) + { + VTDAddressSpace *vtd_dev_as; +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + struct VTDBus *vtd_bus; + hwaddr addr; + uint64_t sz; +@@ -2403,12 +2411,13 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + sz = VTD_PAGE_SIZE; + } + +- entry.target_as = &vtd_dev_as->as; +- entry.addr_mask = sz - 1; +- entry.iova = addr; +- entry.perm = IOMMU_NONE; +- entry.translated_addr = 0; +- memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry); ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.target_as = &vtd_dev_as->as; ++ event.entry.addr_mask = sz - 1; ++ event.entry.iova = addr; ++ event.entry.perm = IOMMU_NONE; ++ event.entry.translated_addr = 0; ++ memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); + + done: + return true; +@@ -3419,19 +3428,20 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) + size = remain = end - start + 1; + + while (remain >= VTD_PAGE_SIZE) { +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + uint64_t mask = get_naturally_aligned_size(start, remain, s->aw_bits); + + assert(mask); + +- entry.iova = start; +- entry.addr_mask = mask - 1; +- entry.target_as = &address_space_memory; +- entry.perm = IOMMU_NONE; ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.iova = start; ++ event.entry.addr_mask = mask - 1; ++ event.entry.target_as = &address_space_memory; ++ event.entry.perm = 
IOMMU_NONE; + /* This field is meaningless for unmap */ +- entry.translated_addr = 0; ++ event.entry.translated_addr = 0; + +- memory_region_notify_iommu_one(n, &entry); ++ memory_region_notify_iommu_one(n, &event); + + start += mask; + remain -= mask; +@@ -3467,9 +3477,9 @@ static void vtd_address_space_refresh_all(IntelIOMMUState *s) + vtd_switch_address_space_all(s); + } + +-static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) ++static int vtd_replay_hook(IOMMUTLBEvent *event, void *private) + { +- memory_region_notify_iommu_one((IOMMUNotifier *)private, entry); ++ memory_region_notify_iommu_one(private, event); + return 0; + } + +diff --git a/hw/misc/tz-mpc.c b/hw/misc/tz-mpc.c +index 49dd6050bd3..e2fbd1065d8 100644 +--- a/hw/misc/tz-mpc.c ++++ b/hw/misc/tz-mpc.c +@@ -82,8 +82,10 @@ static void tz_mpc_iommu_notify(TZMPC *s, uint32_t lutidx, + /* Called when the LUT word at lutidx has changed from oldlut to newlut; + * must call the IOMMU notifiers for the changed blocks. + */ +- IOMMUTLBEntry entry = { +- .addr_mask = s->blocksize - 1, ++ IOMMUTLBEvent event = { ++ .entry = { ++ .addr_mask = s->blocksize - 1, ++ } + }; + hwaddr addr = lutidx * s->blocksize * 32; + int i; +@@ -100,26 +102,28 @@ static void tz_mpc_iommu_notify(TZMPC *s, uint32_t lutidx, + block_is_ns = newlut & (1 << i); + + trace_tz_mpc_iommu_notify(addr); +- entry.iova = addr; +- entry.translated_addr = addr; ++ event.entry.iova = addr; ++ event.entry.translated_addr = addr; + +- entry.perm = IOMMU_NONE; +- memory_region_notify_iommu(&s->upstream, IOMMU_IDX_S, entry); +- memory_region_notify_iommu(&s->upstream, IOMMU_IDX_NS, entry); ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.perm = IOMMU_NONE; ++ memory_region_notify_iommu(&s->upstream, IOMMU_IDX_S, event); ++ memory_region_notify_iommu(&s->upstream, IOMMU_IDX_NS, event); + +- entry.perm = IOMMU_RW; ++ event.type = IOMMU_NOTIFIER_MAP; ++ event.entry.perm = IOMMU_RW; + if (block_is_ns) { +- entry.target_as = 
&s->blocked_io_as; ++ event.entry.target_as = &s->blocked_io_as; + } else { +- entry.target_as = &s->downstream_as; ++ event.entry.target_as = &s->downstream_as; + } +- memory_region_notify_iommu(&s->upstream, IOMMU_IDX_S, entry); ++ memory_region_notify_iommu(&s->upstream, IOMMU_IDX_S, event); + if (block_is_ns) { +- entry.target_as = &s->downstream_as; ++ event.entry.target_as = &s->downstream_as; + } else { +- entry.target_as = &s->blocked_io_as; ++ event.entry.target_as = &s->blocked_io_as; + } +- memory_region_notify_iommu(&s->upstream, IOMMU_IDX_NS, entry); ++ memory_region_notify_iommu(&s->upstream, IOMMU_IDX_NS, event); + } + } + +diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c +index 3d3bcc86496..9d3ec7e2c07 100644 +--- a/hw/ppc/spapr_iommu.c ++++ b/hw/ppc/spapr_iommu.c +@@ -445,7 +445,7 @@ static void spapr_tce_reset(DeviceState *dev) + static target_ulong put_tce_emu(SpaprTceTable *tcet, target_ulong ioba, + target_ulong tce) + { +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift); + unsigned long index = (ioba - tcet->bus_offset) >> tcet->page_shift; + +@@ -457,12 +457,13 @@ static target_ulong put_tce_emu(SpaprTceTable *tcet, target_ulong ioba, + + tcet->table[index] = tce; + +- entry.target_as = &address_space_memory, +- entry.iova = (ioba - tcet->bus_offset) & page_mask; +- entry.translated_addr = tce & page_mask; +- entry.addr_mask = ~page_mask; +- entry.perm = spapr_tce_iommu_access_flags(tce); +- memory_region_notify_iommu(&tcet->iommu, 0, entry); ++ event.entry.target_as = &address_space_memory, ++ event.entry.iova = (ioba - tcet->bus_offset) & page_mask; ++ event.entry.translated_addr = tce & page_mask; ++ event.entry.addr_mask = ~page_mask; ++ event.entry.perm = spapr_tce_iommu_access_flags(tce); ++ event.type = event.entry.perm ? 
IOMMU_NOTIFIER_MAP : IOMMU_NOTIFIER_UNMAP; ++ memory_region_notify_iommu(&tcet->iommu, 0, event); + + return H_SUCCESS; + } +diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c +index 92c7e45df5f..27b189e6d75 100644 +--- a/hw/s390x/s390-pci-inst.c ++++ b/hw/s390x/s390-pci-inst.c +@@ -575,15 +575,18 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + { + S390IOTLBEntry *cache = g_hash_table_lookup(iommu->iotlb, &entry->iova); +- IOMMUTLBEntry notify = { +- .target_as = &address_space_memory, +- .iova = entry->iova, +- .translated_addr = entry->translated_addr, +- .perm = entry->perm, +- .addr_mask = ~PAGE_MASK, ++ IOMMUTLBEvent event = { ++ .type = entry->perm ? IOMMU_NOTIFIER_MAP : IOMMU_NOTIFIER_UNMAP, ++ .entry = { ++ .target_as = &address_space_memory, ++ .iova = entry->iova, ++ .translated_addr = entry->translated_addr, ++ .perm = entry->perm, ++ .addr_mask = ~PAGE_MASK, ++ }, + }; + +- if (entry->perm == IOMMU_NONE) { ++ if (event.type == IOMMU_NOTIFIER_UNMAP) { + if (!cache) { + return; + } +@@ -595,9 +598,11 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + return; + } + +- notify.perm = IOMMU_NONE; +- memory_region_notify_iommu(&iommu->iommu_mr, 0, notify); +- notify.perm = entry->perm; ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.perm = IOMMU_NONE; ++ memory_region_notify_iommu(&iommu->iommu_mr, 0, event); ++ event.type = IOMMU_NOTIFIER_MAP; ++ event.entry.perm = entry->perm; + } + + cache = g_new(S390IOTLBEntry, 1); +@@ -608,7 +613,7 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + g_hash_table_replace(iommu->iotlb, &cache->iova, cache); + } + +- memory_region_notify_iommu(&iommu->iommu_mr, 0, notify); ++ memory_region_notify_iommu(&iommu->iommu_mr, 0, event); + } + + int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) +diff --git 
a/include/exec/memory.h b/include/exec/memory.h +index b6466ab6d57..80e36077cdb 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -106,6 +106,11 @@ struct IOMMUNotifier { + }; + typedef struct IOMMUNotifier IOMMUNotifier; + ++typedef struct IOMMUTLBEvent { ++ IOMMUNotifierFlag type; ++ IOMMUTLBEntry entry; ++} IOMMUTLBEvent; ++ + /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ + #define RAM_PREALLOC (1 << 0) + +@@ -1047,24 +1052,18 @@ uint64_t memory_region_iommu_get_min_page_size(IOMMUMemoryRegion *iommu_mr); + /** + * memory_region_notify_iommu: notify a change in an IOMMU translation entry. + * +- * The notification type will be decided by entry.perm bits: +- * +- * - For UNMAP (cache invalidation) notifies: set entry.perm to IOMMU_NONE. +- * - For MAP (newly added entry) notifies: set entry.perm to the +- * permission of the page (which is definitely !IOMMU_NONE). +- * + * Note: for any IOMMU implementation, an in-place mapping change + * should be notified with an UNMAP followed by a MAP. + * + * @iommu_mr: the memory region that was changed + * @iommu_idx: the IOMMU index for the translation table which has changed +- * @entry: the new entry in the IOMMU translation table. The entry +- * replaces all old entries for the same virtual I/O address range. +- * Deleted entries have .@perm == 0. ++ * @event: TLB event with the new entry in the IOMMU translation table. ++ * The entry replaces all old entries for the same virtual I/O address ++ * range. + */ + void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + int iommu_idx, +- IOMMUTLBEntry entry); ++ IOMMUTLBEvent event); + + /** + * memory_region_notify_iommu_one: notify a change in an IOMMU translation +@@ -1074,12 +1073,12 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + * notifies a specific notifier, not all of them. + * + * @notifier: the notifier to be notified +- * @entry: the new entry in the IOMMU translation table. 
The entry +- * replaces all old entries for the same virtual I/O address range. +- * Deleted entries have .@perm == 0. ++ * @event: TLB event with the new entry in the IOMMU translation table. ++ * The entry replaces all old entries for the same virtual I/O address ++ * range. + */ + void memory_region_notify_iommu_one(IOMMUNotifier *notifier, +- IOMMUTLBEntry *entry); ++ IOMMUTLBEvent *event); + + /** + * memory_region_register_iommu_notifier: register a notifier for changes to +diff --git a/memory.c b/memory.c +index 43bd3359bf8..3bd99b8ac4a 100644 +--- a/memory.c ++++ b/memory.c +@@ -1912,11 +1912,15 @@ void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + } + + void memory_region_notify_iommu_one(IOMMUNotifier *notifier, +- IOMMUTLBEntry *entry) ++ IOMMUTLBEvent *event) + { +- IOMMUNotifierFlag request_flags; ++ IOMMUTLBEntry *entry = &event->entry; + hwaddr entry_end = entry->iova + entry->addr_mask; + ++ if (event->type == IOMMU_NOTIFIER_UNMAP) { ++ assert(entry->perm == IOMMU_NONE); ++ } ++ + /* + * Skip the notification if the notification does not overlap + * with registered range. 
+@@ -1927,20 +1931,14 @@ void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + + assert(entry->iova >= notifier->start && entry_end <= notifier->end); + +- if (entry->perm & IOMMU_RW) { +- request_flags = IOMMU_NOTIFIER_MAP; +- } else { +- request_flags = IOMMU_NOTIFIER_UNMAP; +- } +- +- if (notifier->notifier_flags & request_flags) { ++ if (event->type & notifier->notifier_flags) { + notifier->notify(notifier, entry); + } + } + + void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + int iommu_idx, +- IOMMUTLBEntry entry) ++ IOMMUTLBEvent event) + { + IOMMUNotifier *iommu_notifier; + +@@ -1948,7 +1946,7 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + + IOMMU_NOTIFIER_FOREACH(iommu_notifier, iommu_mr) { + if (iommu_notifier->iommu_idx == iommu_idx) { +- memory_region_notify_iommu_one(iommu_notifier, &entry); ++ memory_region_notify_iommu_one(iommu_notifier, &event); + } + } + } +-- +2.27.0 + diff --git a/kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch b/kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch new file mode 100755 index 0000000..89eb9c9 --- /dev/null +++ b/kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch @@ -0,0 +1,89 @@ +From f0fa537af2e1e5f827eeb74dc5b3e12776917a67 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:36 -0500 +Subject: [PATCH 12/17] memory: Add IOMMU_NOTIFIER_DEVIOTLB_UNMAP + IOMMUTLBNotificationType +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-12-eperezma@redhat.com> +Patchwork-id: 100604 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 11/13] memory: Add IOMMU_NOTIFIER_DEVIOTLB_UNMAP IOMMUTLBNotificationType +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +This allows us to differentiate between regular IOMMU map/unmap events +and DEVIOTLB unmap. 
Doing so, notifiers that only need device IOTLB +invalidations will not receive regular IOMMU unmappings. + +Adapt intel and vhost to use it. + +Signed-off-by: Eugenio Pérez +Reviewed-by: Peter Xu +Reviewed-by: Juan Quintela +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-4-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit b68ba1ca57677acf870d5ab10579e6105c1f5338) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/i386/intel_iommu.c | 2 +- + hw/virtio/vhost.c | 2 +- + include/exec/memory.h | 7 ++++++- + 3 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 9fedbac82de..3640bc2ed15 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -2411,7 +2411,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + sz = VTD_PAGE_SIZE; + } + +- event.type = IOMMU_NOTIFIER_UNMAP; ++ event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP; + event.entry.target_as = &vtd_dev_as->as; + event.entry.addr_mask = sz - 1; + event.entry.iova = addr; +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 9182a00495e..78a5df3b379 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -704,7 +704,7 @@ static void vhost_iommu_region_add(MemoryListener *listener, + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); + iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, +- IOMMU_NOTIFIER_UNMAP, ++ IOMMU_NOTIFIER_DEVIOTLB_UNMAP, + section->offset_within_region, + int128_get64(end), + iommu_idx); +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 80e36077cdb..403dc0c0572 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -87,9 +87,14 @@ typedef enum { + IOMMU_NOTIFIER_UNMAP = 0x1, + /* Notify entry changes (newly created entries) */ + IOMMU_NOTIFIER_MAP = 0x2, ++ /* Notify changes on device IOTLB entries */ ++ 
IOMMU_NOTIFIER_DEVIOTLB_UNMAP = 0x04, + } IOMMUNotifierFlag; + +-#define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP) ++#define IOMMU_NOTIFIER_IOTLB_EVENTS (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP) ++#define IOMMU_NOTIFIER_DEVIOTLB_EVENTS IOMMU_NOTIFIER_DEVIOTLB_UNMAP ++#define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_IOTLB_EVENTS | \ ++ IOMMU_NOTIFIER_DEVIOTLB_EVENTS) + + struct IOMMUNotifier; + typedef void (*IOMMUNotify)(struct IOMMUNotifier *notifier, +-- +2.27.0 + diff --git a/kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch b/kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch new file mode 100755 index 0000000..8921c14 --- /dev/null +++ b/kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch @@ -0,0 +1,146 @@ +From e876535fd5ed10abf0dbeb55ec7098664412068e Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:34 -0500 +Subject: [PATCH 10/17] memory: Rename memory_region_notify_one to + memory_region_notify_iommu_one +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-10-eperezma@redhat.com> +Patchwork-id: 100602 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 09/13] memory: Rename memory_region_notify_one to memory_region_notify_iommu_one +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +Previous name didn't reflect the iommu operation. + +Signed-off-by: Eugenio Pérez +Reviewed-by: Peter Xu +Reviewed-by: David Gibson +Reviewed-by: Juan Quintela +Reviewed-by: Eric Auger +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-2-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 3b5ebf8532afdc1518bd8b0961ed802bc3f5f07c) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/arm/smmu-common.c | 2 +- + hw/arm/smmuv3.c | 2 +- + hw/i386/intel_iommu.c | 4 ++-- + include/exec/memory.h | 6 +++--- + memory.c | 6 +++--- + 5 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 9780404f002..dfabe381182 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -471,7 +471,7 @@ static void smmu_unmap_notifier_range(IOMMUNotifier *n) + entry.perm = IOMMU_NONE; + entry.addr_mask = n->end - n->start; + +- memory_region_notify_one(n, &entry); ++ memory_region_notify_iommu_one(n, &entry); + } + + /* Unmap all notifiers attached to @mr */ +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index a418fab2aa6..ef8a877c5d8 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -811,7 +811,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + entry.addr_mask = num_pages * (1 << granule) - 1; + entry.perm = IOMMU_NONE; + +- memory_region_notify_one(n, &entry); ++ memory_region_notify_iommu_one(n, &entry); + } + + /* invalidate an asid/iova range tuple in all mr's */ +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 43c94b993b4..463f107ad12 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -3431,7 +3431,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) + /* This field is meaningless for unmap */ + entry.translated_addr = 0; + +- memory_region_notify_one(n, &entry); ++ memory_region_notify_iommu_one(n, &entry); + + start += mask; + remain -= mask; +@@ -3469,7 +3469,7 @@ static void vtd_address_space_refresh_all(IntelIOMMUState *s) + + static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) + { +- memory_region_notify_one((IOMMUNotifier *)private, entry); ++ memory_region_notify_iommu_one((IOMMUNotifier *)private, entry); + return 0; + } + +diff --git a/include/exec/memory.h b/include/exec/memory.h +index e499dc215b3..b6466ab6d57 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h 
+@@ -226,7 +226,7 @@ enum IOMMUMemoryRegionAttr { + * The IOMMU implementation must use the IOMMU notifier infrastructure + * to report whenever mappings are changed, by calling + * memory_region_notify_iommu() (or, if necessary, by calling +- * memory_region_notify_one() for each registered notifier). ++ * memory_region_notify_iommu_one() for each registered notifier). + * + * Conceptually an IOMMU provides a mapping from input address + * to an output TLB entry. If the IOMMU is aware of memory transaction +@@ -1067,7 +1067,7 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + IOMMUTLBEntry entry); + + /** +- * memory_region_notify_one: notify a change in an IOMMU translation ++ * memory_region_notify_iommu_one: notify a change in an IOMMU translation + * entry to a single notifier + * + * This works just like memory_region_notify_iommu(), but it only +@@ -1078,7 +1078,7 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + * replaces all old entries for the same virtual I/O address range. + * Deleted entries have .@perm == 0. 
+ */ +-void memory_region_notify_one(IOMMUNotifier *notifier, ++void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + IOMMUTLBEntry *entry); + + /** +diff --git a/memory.c b/memory.c +index 06484c2bff2..43bd3359bf8 100644 +--- a/memory.c ++++ b/memory.c +@@ -1911,8 +1911,8 @@ void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + memory_region_update_iommu_notify_flags(iommu_mr, NULL); + } + +-void memory_region_notify_one(IOMMUNotifier *notifier, +- IOMMUTLBEntry *entry) ++void memory_region_notify_iommu_one(IOMMUNotifier *notifier, ++ IOMMUTLBEntry *entry) + { + IOMMUNotifierFlag request_flags; + hwaddr entry_end = entry->iova + entry->addr_mask; +@@ -1948,7 +1948,7 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + + IOMMU_NOTIFIER_FOREACH(iommu_notifier, iommu_mr) { + if (iommu_notifier->iommu_idx == iommu_idx) { +- memory_region_notify_one(iommu_notifier, &entry); ++ memory_region_notify_iommu_one(iommu_notifier, &entry); + } + } + } +-- +2.27.0 + diff --git a/kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch b/kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch new file mode 100755 index 0000000..f81c86f --- /dev/null +++ b/kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch @@ -0,0 +1,104 @@ +From 13f4ebe4708f4f4dc20d710e475a42d520459860 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:03 -0400 +Subject: [PATCH 4/7] memory: Revert "memory: accept mismatching sizes in + memory_region_access_valid" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-4-jmaloy@redhat.com> +Patchwork-id: 101480 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 3/6] memory: Revert "memory: accept mismatching sizes in memory_region_access_valid" +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: "Michael S. 
Tsirkin" + +Memory API documentation documents valid .min_access_size and .max_access_size +fields and explains that any access outside these boundaries is blocked. + +This is what devices seem to assume. + +However this is not what the implementation does: it simply +ignores the boundaries unless there's an "accepts" callback. + +Naturally, this breaks a bunch of devices. + +Revert to the documented behaviour. + +Devices that want to allow any access can just drop the valid field, +or add the impl field to have accesses converted to appropriate +length. + +Cc: qemu-stable@nongnu.org +Reviewed-by: Richard Henderson +Fixes: CVE-2020-13754 +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1842363 +Fixes: a014ed07bd5a ("memory: accept mismatching sizes in memory_region_access_valid") +Signed-off-by: Michael S. Tsirkin +Message-Id: <20200610134731.1514409-1-mst@redhat.com> +Signed-off-by: Paolo Bonzini + +(cherry picked from commit 5d971f9e672507210e77d020d89e0e89165c8fc9) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + memory.c | 29 +++++++++-------------------- + 1 file changed, 9 insertions(+), 20 deletions(-) + +diff --git a/memory.c b/memory.c +index 5a4a80842d..0cfcb72a5a 100644 +--- a/memory.c ++++ b/memory.c +@@ -1351,35 +1351,24 @@ bool memory_region_access_valid(MemoryRegion *mr, + bool is_write, + MemTxAttrs attrs) + { +- int access_size_min, access_size_max; +- int access_size, i; +- +- if (!mr->ops->valid.unaligned && (addr & (size - 1))) { ++ if (mr->ops->valid.accepts ++ && !mr->ops->valid.accepts(mr->opaque, addr, size, is_write, attrs)) { + return false; + } + +- if (!mr->ops->valid.accepts) { +- return true; +- } +- +- access_size_min = mr->ops->valid.min_access_size; +- if (!mr->ops->valid.min_access_size) { +- access_size_min = 1; ++ if (!mr->ops->valid.unaligned && (addr & (size - 1))) { ++ return false; + } + +- access_size_max = mr->ops->valid.max_access_size; ++ /* Treat zero as compatibility all valid */ + if (!mr->ops->valid.max_access_size) { +- access_size_max = 4; ++ return true; + } + +- access_size = MAX(MIN(size, access_size_max), access_size_min); +- for (i = 0; i < size; i += access_size) { +- if (!mr->ops->valid.accepts(mr->opaque, addr + i, access_size, +- is_write, attrs)) { +- return false; +- } ++ if (size > mr->ops->valid.max_access_size ++ || size < mr->ops->valid.min_access_size) { ++ return false; + } +- + return true; + } + +-- +2.27.0 + diff --git a/kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch b/kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch new file mode 100755 index 0000000..de56901 --- /dev/null +++ b/kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch @@ -0,0 +1,70 @@ +From 8c5154729effda3f762bfb8224f9c61dab8b2986 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:38 -0500 +Subject: [PATCH 14/17] memory: Skip bad range assertion if notifier is + DEVIOTLB_UNMAP type +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 
8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-14-eperezma@redhat.com> +Patchwork-id: 100606 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 13/13] memory: Skip bad range assertion if notifier is DEVIOTLB_UNMAP type +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +Device IOTLB invalidations can unmap arbitrary ranges, either outside of +the memory region or even [0, ~0ULL] for all the space. The assertion +could be hit by a guest, and rhel7 guest effectively hit it. + +Signed-off-by: Eugenio Pérez +Reviewed-by: Peter Xu +Reviewed-by: Juan Quintela +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-6-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 1804857f19f612f6907832e35599cdb51d4ec764) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + memory.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/memory.c b/memory.c +index 3bd99b8ac4a..5a4a80842d7 100644 +--- a/memory.c ++++ b/memory.c +@@ -1916,6 +1916,7 @@ void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + { + IOMMUTLBEntry *entry = &event->entry; + hwaddr entry_end = entry->iova + entry->addr_mask; ++ IOMMUTLBEntry tmp = *entry; + + if (event->type == IOMMU_NOTIFIER_UNMAP) { + assert(entry->perm == IOMMU_NONE); +@@ -1929,10 +1930,16 @@ void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + return; + } + +- assert(entry->iova >= notifier->start && entry_end <= notifier->end); ++ if (notifier->notifier_flags & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) { ++ /* Crop (iova, addr_mask) to range */ ++ tmp.iova = MAX(tmp.iova, notifier->start); ++ tmp.addr_mask = MIN(entry_end, notifier->end) - tmp.iova; ++ } else { ++ assert(entry->iova >= notifier->start && entry_end <= notifier->end); ++ } + + if (event->type & notifier->notifier_flags) { +- notifier->notify(notifier, entry); ++ notifier->notify(notifier, &tmp); + } + } + +--
+2.27.0 + diff --git a/kvm-memory-clamp-cached-translation-in-case-it-points-to.patch b/kvm-memory-clamp-cached-translation-in-case-it-points-to.patch new file mode 100755 index 0000000..8b8f67a --- /dev/null +++ b/kvm-memory-clamp-cached-translation-in-case-it-points-to.patch @@ -0,0 +1,87 @@ +From 354946f1e5fee0a69282bdf284c969b03a78a53e Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 13 Jan 2021 00:42:23 -0500 +Subject: [PATCH 15/17] memory: clamp cached translation in case it points to + an MMIO region + +RH-Author: Jon Maloy +Message-id: <20210113004223.871394-2-jmaloy@redhat.com> +Patchwork-id: 100618 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] memory: clamp cached translation in case it points to an MMIO region +Bugzilla: 1904393 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Thomas Huth + +From: Paolo Bonzini + +In using the address_space_translate_internal API, address_space_cache_init +forgot one piece of advice that can be found in the code for +address_space_translate_internal: + + /* MMIO registers can be expected to perform full-width accesses based only + * on their address, without considering adjacent registers that could + * decode to completely different MemoryRegions. When such registers + * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO + * regions overlap wildly. For this reason we cannot clamp the accesses + * here. + * + * If the length is small (as is the case for address_space_ldl/stl), + * everything works fine. If the incoming length is large, however, + * the caller really has to do the clamping through memory_access_size. + */ + +address_space_cache_init is exactly one such case where "the incoming length +is large", therefore we need to clamp the resulting length---not to +memory_access_size though, since we are not doing an access yet, but to +the size of the resulting section. This ensures that subsequent accesses +to the cached MemoryRegionSection will be in range. 
+ +With this patch, the enclosed testcase notices that the used ring does +not fit into the MSI-X table and prints a "qemu-system-x86_64: Cannot map used" +error. + +Signed-off-by: Paolo Bonzini + +(cherry picked from 4bfb024bc76973d40a359476dc0291f46e435442) +- Manually applied to file exec.c, where the code to correct + is located in this version. +- Skipped the fuzzing test part, which is hard to apply on this code. +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. de Paula +--- + exec.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/exec.c b/exec.c +index ffdb5185353..09ed0cfc756 100644 +--- a/exec.c ++++ b/exec.c +@@ -3620,6 +3620,7 @@ int64_t address_space_cache_init(MemoryRegionCache *cache, + AddressSpaceDispatch *d; + hwaddr l; + MemoryRegion *mr; ++ Int128 diff; + + assert(len > 0); + +@@ -3628,6 +3629,15 @@ int64_t address_space_cache_init(MemoryRegionCache *cache, + d = flatview_to_dispatch(cache->fv); + cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true); + ++ /* ++ * cache->xlat is now relative to cache->mrs.mr, not to the section itself. ++ * Take that into account to compute how many bytes are there between ++ * cache->xlat and the end of the section. 
++ */ ++ diff = int128_sub(cache->mrs.size, ++ int128_make64(cache->xlat - cache->mrs.offset_within_region)); ++ l = int128_get64(int128_min(diff, int128_make64(l))); ++ + mr = cache->mrs.mr; + memory_region_ref(mr); + if (memory_access_is_direct(mr, is_write)) { +-- +2.27.0 + diff --git a/kvm-migration-Change-SaveStateEntry.instance_id-into-uin.patch b/kvm-migration-Change-SaveStateEntry.instance_id-into-uin.patch new file mode 100755 index 0000000..3477af5 --- /dev/null +++ b/kvm-migration-Change-SaveStateEntry.instance_id-into-uin.patch @@ -0,0 +1,179 @@ +From 38a032829b6b8d523b4cee05f732031e66fc2e41 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 31 Jan 2020 17:12:56 +0000 +Subject: [PATCH 14/15] migration: Change SaveStateEntry.instance_id into + uint32_t + +RH-Author: Peter Xu +Message-id: <20200131171257.1066593-3-peterx@redhat.com> +Patchwork-id: 93629 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/3] migration: Change SaveStateEntry.instance_id into uint32_t +Bugzilla: 1529231 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Juan Quintela +RH-Acked-by: Dr. David Alan Gilbert + +It was always used as 32bit, so define it as used to be clear. +Instead of using -1 as the auto-gen magic value, we switch to +UINT32_MAX. We also make sure that we don't auto-gen this value to +avoid overflowed instance IDs without being noticed. + +Suggested-by: Juan Quintela +Signed-off-by: Peter Xu +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 93062e23619e057743757ee53bf7f8e07f7a3710) +Signed-off-by: Peter Xu +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + include/migration/vmstate.h + migration/savevm.c + stubs/vmstate.c + Due to missing 3cad405bab ("vmstate: replace DeviceState with + VMStateIf", 2020-01-06) + +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/intc/apic_common.c | 2 +- + include/migration/register.h | 2 +- + include/migration/vmstate.h | 2 +- + migration/savevm.c | 18 ++++++++++-------- + stubs/vmstate.c | 2 +- + 5 files changed, 14 insertions(+), 12 deletions(-) + +diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c +index f2c3a7f..54b8731 100644 +--- a/hw/intc/apic_common.c ++++ b/hw/intc/apic_common.c +@@ -268,7 +268,7 @@ static void apic_common_realize(DeviceState *dev, Error **errp) + APICCommonState *s = APIC_COMMON(dev); + APICCommonClass *info; + static DeviceState *vapic; +- int instance_id = s->id; ++ uint32_t instance_id = s->id; + + info = APIC_COMMON_GET_CLASS(s); + info->realize(dev, errp); +diff --git a/include/migration/register.h b/include/migration/register.h +index a13359a..f3ba10b 100644 +--- a/include/migration/register.h ++++ b/include/migration/register.h +@@ -69,7 +69,7 @@ typedef struct SaveVMHandlers { + } SaveVMHandlers; + + int register_savevm_live(const char *idstr, +- int instance_id, ++ uint32_t instance_id, + int version_id, + const SaveVMHandlers *ops, + void *opaque); +diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h +index 883f1cf..296609c 100644 +--- a/include/migration/vmstate.h ++++ b/include/migration/vmstate.h +@@ -1158,7 +1158,7 @@ bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque); + #define VMSTATE_INSTANCE_ID_ANY -1 + + /* Returns: 0 on success, -1 on failure */ +-int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, ++int vmstate_register_with_alias_id(DeviceState *dev, uint32_t instance_id, + const VMStateDescription *vmsd, + void *base, int alias_id, + int required_for_version, +diff --git a/migration/savevm.c b/migration/savevm.c +index e2e8e0a..a80bb52 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -233,7 +233,7 @@ typedef struct CompatEntry { + typedef struct SaveStateEntry { + QTAILQ_ENTRY(SaveStateEntry) entry; + char idstr[256]; +- int instance_id; ++ 
uint32_t instance_id; + int alias_id; + int version_id; + /* version id read from the stream */ +@@ -665,10 +665,10 @@ void dump_vmstate_json_to_file(FILE *out_file) + fclose(out_file); + } + +-static int calculate_new_instance_id(const char *idstr) ++static uint32_t calculate_new_instance_id(const char *idstr) + { + SaveStateEntry *se; +- int instance_id = 0; ++ uint32_t instance_id = 0; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (strcmp(idstr, se->idstr) == 0 +@@ -676,6 +676,8 @@ static int calculate_new_instance_id(const char *idstr) + instance_id = se->instance_id + 1; + } + } ++ /* Make sure we never loop over without being noticed */ ++ assert(instance_id != VMSTATE_INSTANCE_ID_ANY); + return instance_id; + } + +@@ -730,7 +732,7 @@ static void savevm_state_handler_insert(SaveStateEntry *nse) + Meanwhile pass -1 as instance_id if you do not already have a clearly + distinguishing id for all instances of your device class. */ + int register_savevm_live(const char *idstr, +- int instance_id, ++ uint32_t instance_id, + int version_id, + const SaveVMHandlers *ops, + void *opaque) +@@ -784,7 +786,7 @@ void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque) + } + } + +-int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, ++int vmstate_register_with_alias_id(DeviceState *dev, uint32_t instance_id, + const VMStateDescription *vmsd, + void *opaque, int alias_id, + int required_for_version, +@@ -1600,7 +1602,7 @@ int qemu_save_device_state(QEMUFile *f) + return qemu_file_get_error(f); + } + +-static SaveStateEntry *find_se(const char *idstr, int instance_id) ++static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id) + { + SaveStateEntry *se; + +@@ -2267,7 +2269,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis) + /* Find savevm section */ + se = find_se(idstr, instance_id); + if (se == NULL) { +- error_report("Unknown savevm section or instance '%s' %d. 
" ++ error_report("Unknown savevm section or instance '%s' %"PRIu32". " + "Make sure that your current VM setup matches your " + "saved VM setup, including any hotplugged devices", + idstr, instance_id); +@@ -2291,7 +2293,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis) + + ret = vmstate_load(f, se); + if (ret < 0) { +- error_report("error while loading state for instance 0x%x of" ++ error_report("error while loading state for instance 0x%"PRIx32" of" + " device '%s'", instance_id, idstr); + return ret; + } +diff --git a/stubs/vmstate.c b/stubs/vmstate.c +index e1e89b8..4ed5cc6 100644 +--- a/stubs/vmstate.c ++++ b/stubs/vmstate.c +@@ -4,7 +4,7 @@ + const VMStateDescription vmstate_dummy = {}; + + int vmstate_register_with_alias_id(DeviceState *dev, +- int instance_id, ++ uint32_t instance_id, + const VMStateDescription *vmsd, + void *base, int alias_id, + int required_for_version, +-- +1.8.3.1 + diff --git a/kvm-migration-Create-migration_is_running.patch b/kvm-migration-Create-migration_is_running.patch new file mode 100755 index 0000000..c9593de --- /dev/null +++ b/kvm-migration-Create-migration_is_running.patch @@ -0,0 +1,119 @@ +From c9e3d13d70a24bf606ce351886b27bdca25ef4dc Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:41 +0000 +Subject: [PATCH 09/18] migration: Create migration_is_running() + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-9-quintela@redhat.com> +Patchwork-id: 94115 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 08/10] migration: Create migration_is_running() +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +This function returns true if we are in the middle of a migration. +It is like migration_is_setup_or_active() with CANCELLING and COLO. +Adapt all callers that are needed. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. 
David Alan Gilbert +(cherry picked from commit 392d87e21325fdb01210176faa07472b4985ccf0) +Signed-off-by: Danilo C. L. de Paula +--- + migration/migration.c | 29 ++++++++++++++++++++++++----- + migration/migration.h | 1 + + migration/savevm.c | 4 +--- + 3 files changed, 26 insertions(+), 8 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 30c53c6..eb50d77 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -831,6 +831,27 @@ bool migration_is_setup_or_active(int state) + } + } + ++bool migration_is_running(int state) ++{ ++ switch (state) { ++ case MIGRATION_STATUS_ACTIVE: ++ case MIGRATION_STATUS_POSTCOPY_ACTIVE: ++ case MIGRATION_STATUS_POSTCOPY_PAUSED: ++ case MIGRATION_STATUS_POSTCOPY_RECOVER: ++ case MIGRATION_STATUS_SETUP: ++ case MIGRATION_STATUS_PRE_SWITCHOVER: ++ case MIGRATION_STATUS_DEVICE: ++ case MIGRATION_STATUS_WAIT_UNPLUG: ++ case MIGRATION_STATUS_CANCELLING: ++ case MIGRATION_STATUS_COLO: ++ return true; ++ ++ default: ++ return false; ++ ++ } ++} ++ + static void populate_time_info(MigrationInfo *info, MigrationState *s) + { + info->has_status = true; +@@ -1090,7 +1111,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, + MigrationCapabilityStatusList *cap; + bool cap_list[MIGRATION_CAPABILITY__MAX]; + +- if (migration_is_setup_or_active(s->state)) { ++ if (migration_is_running(s->state)) { + error_setg(errp, QERR_MIGRATION_ACTIVE); + return; + } +@@ -1603,7 +1624,7 @@ static void migrate_fd_cancel(MigrationState *s) + + do { + old_state = s->state; +- if (!migration_is_setup_or_active(old_state)) { ++ if (!migration_is_running(old_state)) { + break; + } + /* If the migration is paused, kick it out of the pause */ +@@ -1900,9 +1921,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, + return true; + } + +- if (migration_is_setup_or_active(s->state) || +- s->state == MIGRATION_STATUS_CANCELLING || +- s->state == MIGRATION_STATUS_COLO) { ++ if 
(migration_is_running(s->state)) { + error_setg(errp, QERR_MIGRATION_ACTIVE); + return false; + } +diff --git a/migration/migration.h b/migration/migration.h +index 0b1b0d4..a2b2336 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -279,6 +279,7 @@ void migrate_fd_error(MigrationState *s, const Error *error); + void migrate_fd_connect(MigrationState *s, Error *error_in); + + bool migration_is_setup_or_active(int state); ++bool migration_is_running(int state); + + void migrate_init(MigrationState *s); + bool migration_is_blocked(Error **errp); +diff --git a/migration/savevm.c b/migration/savevm.c +index a80bb52..144ecf0 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -1506,9 +1506,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) + MigrationState *ms = migrate_get_current(); + MigrationStatus status; + +- if (migration_is_setup_or_active(ms->state) || +- ms->state == MIGRATION_STATUS_CANCELLING || +- ms->state == MIGRATION_STATUS_COLO) { ++ if (migration_is_running(ms->state)) { + error_setg(errp, QERR_MIGRATION_ACTIVE); + return -EINVAL; + } +-- +1.8.3.1 + diff --git a/kvm-migration-Define-VMSTATE_INSTANCE_ID_ANY.patch b/kvm-migration-Define-VMSTATE_INSTANCE_ID_ANY.patch new file mode 100755 index 0000000..c2ead53 --- /dev/null +++ b/kvm-migration-Define-VMSTATE_INSTANCE_ID_ANY.patch @@ -0,0 +1,257 @@ +From 2659af9267586fb626f543773bf3f844727e473b Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 31 Jan 2020 17:12:55 +0000 +Subject: [PATCH 13/15] migration: Define VMSTATE_INSTANCE_ID_ANY + +RH-Author: Peter Xu +Message-id: <20200131171257.1066593-2-peterx@redhat.com> +Patchwork-id: 93630 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/3] migration: Define VMSTATE_INSTANCE_ID_ANY +Bugzilla: 1529231 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Juan Quintela +RH-Acked-by: Dr. David Alan Gilbert + +Define the new macro VMSTATE_INSTANCE_ID_ANY for callers who want to +auto-generate the vmstate instance ID.
Previously it was hard coded +as -1 instead of this macro. It helps to change this default value in +the follow up patches. No functional change. + +Signed-off-by: Peter Xu +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 1df2c9a26fcb2fa32d099f8e9adcdae4207872e3) +Signed-off-by: Peter Xu +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + backends/dbus-vmstate.c + File deleted + hw/core/qdev.c + hw/misc/max111x.c + hw/net/eepro100.c + Due to missing commit 3cad405bab ("vmstate: replace + DeviceState with VMStateIf", 2020-01-06) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/stellaris.c | 2 +- + hw/core/qdev.c | 3 ++- + hw/display/ads7846.c | 2 +- + hw/i2c/core.c | 2 +- + hw/input/stellaris_input.c | 3 ++- + hw/intc/apic_common.c | 2 +- + hw/misc/max111x.c | 2 +- + hw/net/eepro100.c | 2 +- + hw/pci/pci.c | 2 +- + hw/ppc/spapr.c | 2 +- + hw/timer/arm_timer.c | 2 +- + hw/tpm/tpm_emulator.c | 3 ++- + include/migration/vmstate.h | 2 ++ + migration/savevm.c | 8 ++++---- + 14 files changed, 21 insertions(+), 16 deletions(-) + +diff --git a/hw/arm/stellaris.c b/hw/arm/stellaris.c +index b198066..bb025e0 100644 +--- a/hw/arm/stellaris.c ++++ b/hw/arm/stellaris.c +@@ -708,7 +708,7 @@ static int stellaris_sys_init(uint32_t base, qemu_irq irq, + memory_region_init_io(&s->iomem, NULL, &ssys_ops, s, "ssys", 0x00001000); + memory_region_add_subregion(get_system_memory(), base, &s->iomem); + ssys_reset(s); +- vmstate_register(NULL, -1, &vmstate_stellaris_sys, s); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_stellaris_sys, s); + return 0; + } + +diff --git a/hw/core/qdev.c b/hw/core/qdev.c +index cf1ba28..40f6b2b 100644 +--- a/hw/core/qdev.c ++++ b/hw/core/qdev.c +@@ -890,7 +890,8 @@ static void device_set_realized(Object *obj, bool value, Error **errp) + dev->canonical_path = object_get_canonical_path(OBJECT(dev)); + + if (qdev_get_vmsd(dev)) { +- if (vmstate_register_with_alias_id(dev, -1, qdev_get_vmsd(dev), dev, ++ if 
(vmstate_register_with_alias_id(dev, VMSTATE_INSTANCE_ID_ANY, ++ qdev_get_vmsd(dev), dev, + dev->instance_id_alias, + dev->alias_required_for_version, + &local_err) < 0) { +diff --git a/hw/display/ads7846.c b/hw/display/ads7846.c +index c12272a..9228b40 100644 +--- a/hw/display/ads7846.c ++++ b/hw/display/ads7846.c +@@ -154,7 +154,7 @@ static void ads7846_realize(SSISlave *d, Error **errp) + + ads7846_int_update(s); + +- vmstate_register(NULL, -1, &vmstate_ads7846, s); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_ads7846, s); + } + + static void ads7846_class_init(ObjectClass *klass, void *data) +diff --git a/hw/i2c/core.c b/hw/i2c/core.c +index 92cd489..d770035 100644 +--- a/hw/i2c/core.c ++++ b/hw/i2c/core.c +@@ -61,7 +61,7 @@ I2CBus *i2c_init_bus(DeviceState *parent, const char *name) + + bus = I2C_BUS(qbus_create(TYPE_I2C_BUS, parent, name)); + QLIST_INIT(&bus->current_devs); +- vmstate_register(NULL, -1, &vmstate_i2c_bus, bus); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_i2c_bus, bus); + return bus; + } + +diff --git a/hw/input/stellaris_input.c b/hw/input/stellaris_input.c +index 59892b0..e6ee5e1 100644 +--- a/hw/input/stellaris_input.c ++++ b/hw/input/stellaris_input.c +@@ -88,5 +88,6 @@ void stellaris_gamepad_init(int n, qemu_irq *irq, const int *keycode) + } + s->num_buttons = n; + qemu_add_kbd_event_handler(stellaris_gamepad_put_key, s); +- vmstate_register(NULL, -1, &vmstate_stellaris_gamepad, s); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, ++ &vmstate_stellaris_gamepad, s); + } +diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c +index 375cb6a..f2c3a7f 100644 +--- a/hw/intc/apic_common.c ++++ b/hw/intc/apic_common.c +@@ -284,7 +284,7 @@ static void apic_common_realize(DeviceState *dev, Error **errp) + } + + if (s->legacy_instance_id) { +- instance_id = -1; ++ instance_id = VMSTATE_INSTANCE_ID_ANY; + } + vmstate_register_with_alias_id(NULL, instance_id, &vmstate_apic_common, + s, -1, 0, NULL); +diff --git 
a/hw/misc/max111x.c b/hw/misc/max111x.c +index a713149..81ee73e 100644 +--- a/hw/misc/max111x.c ++++ b/hw/misc/max111x.c +@@ -146,7 +146,7 @@ static int max111x_init(SSISlave *d, int inputs) + s->input[7] = 0x80; + s->com = 0; + +- vmstate_register(dev, -1, &vmstate_max111x, s); ++ vmstate_register(dev, VMSTATE_INSTANCE_ID_ANY, &vmstate_max111x, s); + return 0; + } + +diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c +index cc2dd8b..39920c6 100644 +--- a/hw/net/eepro100.c ++++ b/hw/net/eepro100.c +@@ -1874,7 +1874,7 @@ static void e100_nic_realize(PCIDevice *pci_dev, Error **errp) + + s->vmstate = g_memdup(&vmstate_eepro100, sizeof(vmstate_eepro100)); + s->vmstate->name = qemu_get_queue(s->nic)->model; +- vmstate_register(&pci_dev->qdev, -1, s->vmstate, s); ++ vmstate_register(&pci_dev->qdev, VMSTATE_INSTANCE_ID_ANY, s->vmstate, s); + } + + static void eepro100_instance_init(Object *obj) +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index cbc7a32..fed019d 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -124,7 +124,7 @@ static void pci_bus_realize(BusState *qbus, Error **errp) + bus->machine_done.notify = pcibus_machine_done; + qemu_add_machine_init_done_notifier(&bus->machine_done); + +- vmstate_register(NULL, -1, &vmstate_pcibus, bus); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_pcibus, bus); + } + + static void pcie_bus_realize(BusState *qbus, Error **errp) +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index 8749c72..c12862d 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -3028,7 +3028,7 @@ static void spapr_machine_init(MachineState *machine) + * interface, this is a legacy from the sPAPREnvironment structure + * which predated MachineState but had a similar function */ + vmstate_register(NULL, 0, &vmstate_spapr, spapr); +- register_savevm_live("spapr/htab", -1, 1, ++ register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1, + &savevm_htab_handlers, spapr); + + qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine), +diff 
--git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c +index af524fa..beaa285 100644 +--- a/hw/timer/arm_timer.c ++++ b/hw/timer/arm_timer.c +@@ -180,7 +180,7 @@ static arm_timer_state *arm_timer_init(uint32_t freq) + s->control = TIMER_CTRL_IE; + + s->timer = ptimer_init(arm_timer_tick, s, PTIMER_POLICY_DEFAULT); +- vmstate_register(NULL, -1, &vmstate_arm_timer, s); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_arm_timer, s); + return s; + } + +diff --git a/hw/tpm/tpm_emulator.c b/hw/tpm/tpm_emulator.c +index 22f9113..da7b490 100644 +--- a/hw/tpm/tpm_emulator.c ++++ b/hw/tpm/tpm_emulator.c +@@ -914,7 +914,8 @@ static void tpm_emulator_inst_init(Object *obj) + tpm_emu->cur_locty_number = ~0; + qemu_mutex_init(&tpm_emu->mutex); + +- vmstate_register(NULL, -1, &vmstate_tpm_emulator, obj); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, ++ &vmstate_tpm_emulator, obj); + } + + /* +diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h +index ac4f46a..883f1cf 100644 +--- a/include/migration/vmstate.h ++++ b/include/migration/vmstate.h +@@ -1155,6 +1155,8 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, + + bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque); + ++#define VMSTATE_INSTANCE_ID_ANY -1 ++ + /* Returns: 0 on success, -1 on failure */ + int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, + const VMStateDescription *vmsd, +diff --git a/migration/savevm.c b/migration/savevm.c +index a71b930..e2e8e0a 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -750,7 +750,7 @@ int register_savevm_live(const char *idstr, + + pstrcat(se->idstr, sizeof(se->idstr), idstr); + +- if (instance_id == -1) { ++ if (instance_id == VMSTATE_INSTANCE_ID_ANY) { + se->instance_id = calculate_new_instance_id(se->idstr); + } else { + se->instance_id = instance_id; +@@ -817,14 +817,14 @@ int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, + + se->compat = 
g_new0(CompatEntry, 1); + pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name); +- se->compat->instance_id = instance_id == -1 ? ++ se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ? + calculate_compat_instance_id(vmsd->name) : instance_id; +- instance_id = -1; ++ instance_id = VMSTATE_INSTANCE_ID_ANY; + } + } + pstrcat(se->idstr, sizeof(se->idstr), vmsd->name); + +- if (instance_id == -1) { ++ if (instance_id == VMSTATE_INSTANCE_ID_ANY) { + se->instance_id = calculate_new_instance_id(se->idstr); + } else { + se->instance_id = instance_id; +-- +1.8.3.1 + diff --git a/kvm-migration-Don-t-send-data-if-we-have-stopped.patch b/kvm-migration-Don-t-send-data-if-we-have-stopped.patch new file mode 100755 index 0000000..9a36714 --- /dev/null +++ b/kvm-migration-Don-t-send-data-if-we-have-stopped.patch @@ -0,0 +1,42 @@ +From ab07e0b41c50a85940d798a9a65a58698fd2edfb Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:40 +0000 +Subject: [PATCH 08/18] migration: Don't send data if we have stopped + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-8-quintela@redhat.com> +Patchwork-id: 94114 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 07/10] migration: Don't send data if we have stopped +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +If we do a cancel, we got out without one error, but we can't do the +rest of the output as in a normal situation. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit b69a0227a803256ad270283872d40ff768f4d56d) +Signed-off-by: Danilo C. L. 
de Paula +--- + migration/ram.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/migration/ram.c b/migration/ram.c +index a0257ee..902c56c 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -3511,7 +3511,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) + ram_control_after_iterate(f, RAM_CONTROL_ROUND); + + out: +- if (ret >= 0) { ++ if (ret >= 0 ++ && migration_is_setup_or_active(migrate_get_current()->state)) { + multifd_send_sync_main(rs); + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + qemu_fflush(f); +-- +1.8.3.1 + diff --git a/kvm-migration-Make-sure-that-we-don-t-call-write-in-case.patch b/kvm-migration-Make-sure-that-we-don-t-call-write-in-case.patch new file mode 100755 index 0000000..01cb0f1 --- /dev/null +++ b/kvm-migration-Make-sure-that-we-don-t-call-write-in-case.patch @@ -0,0 +1,94 @@ +From 71b05ab5782aa1e38c016be6264a14f5650d2a87 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:35 +0000 +Subject: [PATCH 03/18] migration: Make sure that we don't call write() in case + of error + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-3-quintela@redhat.com> +Patchwork-id: 94113 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 02/10] migration: Make sure that we don't call write() in case of error +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +If we are exiting due to an error/finish/.... Just don't try to even +touch the channel with one IO operation. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Juan Quintela +(cherry picked from commit 4d65a6216bfc44891ac298b74a6921d479805131) +Signed-off-by: Danilo C. L. 
de Paula +--- + migration/ram.c | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/migration/ram.c b/migration/ram.c +index 65580e3..8c783b3 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -899,6 +899,12 @@ struct { + uint64_t packet_num; + /* send channels ready */ + QemuSemaphore channels_ready; ++ /* ++ * Have we already run terminate threads. There is a race when it ++ * happens that we got one error while we are exiting. ++ * We will use atomic operations. Only valid values are 0 and 1. ++ */ ++ int exiting; + } *multifd_send_state; + + /* +@@ -927,6 +933,10 @@ static int multifd_send_pages(RAMState *rs) + MultiFDPages_t *pages = multifd_send_state->pages; + uint64_t transferred; + ++ if (atomic_read(&multifd_send_state->exiting)) { ++ return -1; ++ } ++ + qemu_sem_wait(&multifd_send_state->channels_ready); + for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { + p = &multifd_send_state->params[i]; +@@ -1008,6 +1018,16 @@ static void multifd_send_terminate_threads(Error *err) + } + } + ++ /* ++ * We don't want to exit each threads twice. Depending on where ++ * we get the error, or if there are two independent errors in two ++ * threads at the same time, we can end calling this function ++ * twice. 
++ */ ++ if (atomic_xchg(&multifd_send_state->exiting, 1)) { ++ return; ++ } ++ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +@@ -1117,6 +1137,10 @@ static void *multifd_send_thread(void *opaque) + + while (true) { + qemu_sem_wait(&p->sem); ++ ++ if (atomic_read(&multifd_send_state->exiting)) { ++ break; ++ } + qemu_mutex_lock(&p->mutex); + + if (p->pending_job) { +@@ -1225,6 +1249,7 @@ int multifd_save_setup(void) + multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); + multifd_send_state->pages = multifd_pages_init(page_count); + qemu_sem_init(&multifd_send_state->channels_ready, 0); ++ atomic_set(&multifd_send_state->exiting, 0); + + for (i = 0; i < thread_count; i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; +-- +1.8.3.1 + diff --git a/kvm-migration-Maybe-VM-is-paused-when-migration-is-cance.patch b/kvm-migration-Maybe-VM-is-paused-when-migration-is-cance.patch new file mode 100755 index 0000000..4a7fb28 --- /dev/null +++ b/kvm-migration-Maybe-VM-is-paused-when-migration-is-cance.patch @@ -0,0 +1,70 @@ +From 3c4f6f0c2bf5562f2aa26f964848ae53e6ac4790 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:43 +0000 +Subject: [PATCH 11/18] migration: Maybe VM is paused when migration is + cancelled + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-11-quintela@redhat.com> +Patchwork-id: 94120 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 10/10] migration: Maybe VM is paused when migration is cancelled +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Zhimin Feng + +If the migration is cancelled when it is in the completion phase, +the migration state is set to MIGRATION_STATUS_CANCELLING. +The VM maybe wait for the 'pause_sem' semaphore in migration_maybe_pause +function, so that VM always is paused. 
+ +Reported-by: Euler Robot +Signed-off-by: Zhimin Feng +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 8958338b10abcb346b54a8038a491fda2db1c853) +Signed-off-by: Danilo C. L. de Paula +--- + migration/migration.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index eb50d77..ed18c59 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -2786,14 +2786,22 @@ static int migration_maybe_pause(MigrationState *s, + /* This block intentionally left blank */ + } + +- qemu_mutex_unlock_iothread(); +- migrate_set_state(&s->state, *current_active_state, +- MIGRATION_STATUS_PRE_SWITCHOVER); +- qemu_sem_wait(&s->pause_sem); +- migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER, +- new_state); +- *current_active_state = new_state; +- qemu_mutex_lock_iothread(); ++ /* ++ * If the migration is cancelled when it is in the completion phase, ++ * the migration state is set to MIGRATION_STATUS_CANCELLING. ++ * So we don't need to wait a semaphore, otherwise we would always ++ * wait for the 'pause_sem' semaphore. ++ */ ++ if (s->state != MIGRATION_STATUS_CANCELLING) { ++ qemu_mutex_unlock_iothread(); ++ migrate_set_state(&s->state, *current_active_state, ++ MIGRATION_STATUS_PRE_SWITCHOVER); ++ qemu_sem_wait(&s->pause_sem); ++ migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER, ++ new_state); ++ *current_active_state = new_state; ++ qemu_mutex_lock_iothread(); ++ } + + return s->state == new_state ? 
0 : -EINVAL; + } +-- +1.8.3.1 + diff --git a/kvm-migration-Rate-limit-inside-host-pages.patch b/kvm-migration-Rate-limit-inside-host-pages.patch new file mode 100755 index 0000000..2d3d519 --- /dev/null +++ b/kvm-migration-Rate-limit-inside-host-pages.patch @@ -0,0 +1,172 @@ +From 8e8f421cce99543081f225acf46541312cfbc371 Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Tue, 17 Mar 2020 17:05:18 +0000 +Subject: [PATCH 1/2] migration: Rate limit inside host pages + +RH-Author: Laurent Vivier +Message-id: <20200317170518.9303-1-lvivier@redhat.com> +Patchwork-id: 94374 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH] migration: Rate limit inside host pages +Bugzilla: 1814336 +RH-Acked-by: Peter Xu +RH-Acked-by: Juan Quintela +RH-Acked-by: Dr. David Alan Gilbert + +From: "Dr. David Alan Gilbert" + +When using hugepages, rate limiting is necessary within each huge +page, since a 1G huge page can take a significant time to send, so +you end up with bursty behaviour. + +Fixes: 4c011c37ecb3 ("postcopy: Send whole huge pages") +Reported-by: Lin Ma +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Juan Quintela +Reviewed-by: Peter Xu +Signed-off-by: Juan Quintela +(cherry picked from commit 97e1e06780e70f6e98a0d2df881e0c0927d3aeb6) +Signed-off-by: Laurent Vivier + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1814336 +BRANCH: rhel-av-8.2.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=27283241 +TESTED: Tested that the migration abort doesn't trigger an error message in + the kernel logs on P9 + +Signed-off-by: Danilo C. L. 
de Paula +--- + migration/migration.c | 57 ++++++++++++++++++++++++++++---------------------- + migration/migration.h | 1 + + migration/ram.c | 2 ++ + migration/trace-events | 4 ++-- + 4 files changed, 37 insertions(+), 27 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index ed18c59..e31d0f5 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -3253,6 +3253,37 @@ void migration_consume_urgent_request(void) + qemu_sem_wait(&migrate_get_current()->rate_limit_sem); + } + ++/* Returns true if the rate limiting was broken by an urgent request */ ++bool migration_rate_limit(void) ++{ ++ int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); ++ MigrationState *s = migrate_get_current(); ++ ++ bool urgent = false; ++ migration_update_counters(s, now); ++ if (qemu_file_rate_limit(s->to_dst_file)) { ++ /* ++ * Wait for a delay to do rate limiting OR ++ * something urgent to post the semaphore. ++ */ ++ int ms = s->iteration_start_time + BUFFER_DELAY - now; ++ trace_migration_rate_limit_pre(ms); ++ if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) { ++ /* ++ * We were woken by one or more urgent things but ++ * the timedwait will have consumed one of them. ++ * The service routine for the urgent wake will dec ++ * the semaphore itself for each item it consumes, ++ * so add this one we just eat back. ++ */ ++ qemu_sem_post(&s->rate_limit_sem); ++ urgent = true; ++ } ++ trace_migration_rate_limit_post(urgent); ++ } ++ return urgent; ++} ++ + /* + * Master migration thread on the source VM. + * It drives the migration and pumps the data down the outgoing channel. 
+@@ -3319,8 +3350,6 @@ static void *migration_thread(void *opaque) + trace_migration_thread_setup_complete(); + + while (migration_is_active(s)) { +- int64_t current_time; +- + if (urgent || !qemu_file_rate_limit(s->to_dst_file)) { + MigIterateState iter_state = migration_iteration_run(s); + if (iter_state == MIG_ITERATE_SKIP) { +@@ -3347,29 +3376,7 @@ static void *migration_thread(void *opaque) + update_iteration_initial_status(s); + } + +- current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); +- +- migration_update_counters(s, current_time); +- +- urgent = false; +- if (qemu_file_rate_limit(s->to_dst_file)) { +- /* Wait for a delay to do rate limiting OR +- * something urgent to post the semaphore. +- */ +- int ms = s->iteration_start_time + BUFFER_DELAY - current_time; +- trace_migration_thread_ratelimit_pre(ms); +- if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) { +- /* We were worken by one or more urgent things but +- * the timedwait will have consumed one of them. +- * The service routine for the urgent wake will dec +- * the semaphore itself for each item it consumes, +- * so add this one we just eat back. 
+- */ +- qemu_sem_post(&s->rate_limit_sem); +- urgent = true; +- } +- trace_migration_thread_ratelimit_post(urgent); +- } ++ urgent = migration_rate_limit(); + } + + trace_migration_thread_after_loop(); +diff --git a/migration/migration.h b/migration/migration.h +index a2b2336..a15e8d8 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -347,5 +347,6 @@ extern bool migrate_pre_2_2; + + void migration_make_urgent_request(void); + void migration_consume_urgent_request(void); ++bool migration_rate_limit(void); + + #endif +diff --git a/migration/ram.c b/migration/ram.c +index 3891eff..5344c7d 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -2661,6 +2661,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, + + pages += tmppages; + pss->page++; ++ /* Allow rate limiting to happen in the middle of huge pages */ ++ migration_rate_limit(); + } while ((pss->page & (pagesize_bits - 1)) && + offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS)); + +diff --git a/migration/trace-events b/migration/trace-events +index 6dee7b5..2f9129e 100644 +--- a/migration/trace-events ++++ b/migration/trace-events +@@ -138,12 +138,12 @@ migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi6 + migration_completion_file_err(void) "" + migration_completion_postcopy_end(void) "" + migration_completion_postcopy_end_after_complete(void) "" ++migration_rate_limit_pre(int ms) "%d ms" ++migration_rate_limit_post(int urgent) "urgent: %d" + migration_return_path_end_before(void) "" + migration_return_path_end_after(int rp_error) "%d" + migration_thread_after_loop(void) "" + migration_thread_file_err(void) "" +-migration_thread_ratelimit_pre(int ms) "%d ms" +-migration_thread_ratelimit_post(int urgent) "urgent: %d" + migration_thread_setup_complete(void) "" + open_return_path_on_source(void) "" + open_return_path_on_source_continue(void) "" +-- +1.8.3.1 + diff --git 
a/kvm-migration-multifd-clean-pages-after-filling-packet.patch b/kvm-migration-multifd-clean-pages-after-filling-packet.patch new file mode 100755 index 0000000..5fa7fde --- /dev/null +++ b/kvm-migration-multifd-clean-pages-after-filling-packet.patch @@ -0,0 +1,65 @@ +From 32ee75b7f4a31d6080e5659e2a0285a046ef1036 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:34 +0000 +Subject: [PATCH 02/18] migration/multifd: clean pages after filling packet + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-2-quintela@redhat.com> +Patchwork-id: 94112 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 01/10] migration/multifd: clean pages after filling packet +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Wei Yang + +This is a preparation for the next patch: + + not use multifd during postcopy. + +Without enabling postcopy, everything looks good. While after enabling +postcopy, migration may fail even not use multifd during postcopy. The +reason is the pages is not properly cleared and *old* target page will +continue to be transferred. + +After clean pages, migration succeeds. + +Signed-off-by: Wei Yang +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit eab54aa78ffd9fb7895b20fc2761ee998479489b) +Signed-off-by: Danilo C. L. 
de Paula +--- + migration/ram.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 5078f94..65580e3 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -944,10 +944,10 @@ static int multifd_send_pages(RAMState *rs) + } + qemu_mutex_unlock(&p->mutex); + } +- p->pages->used = 0; ++ assert(!p->pages->used); ++ assert(!p->pages->block); + + p->packet_num = multifd_send_state->packet_num++; +- p->pages->block = NULL; + multifd_send_state->pages = p->pages; + p->pages = pages; + transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len; +@@ -1129,6 +1129,8 @@ static void *multifd_send_thread(void *opaque) + p->flags = 0; + p->num_packets++; + p->num_pages += used; ++ p->pages->used = 0; ++ p->pages->block = NULL; + qemu_mutex_unlock(&p->mutex); + + trace_multifd_send(p->id, packet_num, used, flags, +-- +1.8.3.1 + diff --git a/kvm-migration-multifd-fix-destroyed-mutex-access-in-term.patch b/kvm-migration-multifd-fix-destroyed-mutex-access-in-term.patch new file mode 100755 index 0000000..0c5fe80 --- /dev/null +++ b/kvm-migration-multifd-fix-destroyed-mutex-access-in-term.patch @@ -0,0 +1,77 @@ +From 2c14a6831954a59256cc8d1980da0ad705a3a3fa Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:37 +0000 +Subject: [PATCH 05/18] migration/multifd: fix destroyed mutex access in + terminating multifd threads + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-5-quintela@redhat.com> +Patchwork-id: 94119 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 04/10] migration/multifd: fix destroyed mutex access in terminating multifd threads +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Jiahui Cen + +One multifd will lock all the other multifds' IOChannel mutex to inform them +to quit by setting p->quit or shutting down p->c. 
In this scenario, if some +multifds had already been terminated and multifd_load_cleanup/multifd_save_cleanup +had destroyed their mutex, it could cause destroyed mutex access when trying to +lock their mutex. + +Here is the coredump stack: + #0 0x00007f81a2794437 in raise () from /usr/lib64/libc.so.6 + #1 0x00007f81a2795b28 in abort () from /usr/lib64/libc.so.6 + #2 0x00007f81a278d1b6 in __assert_fail_base () from /usr/lib64/libc.so.6 + #3 0x00007f81a278d262 in __assert_fail () from /usr/lib64/libc.so.6 + #4 0x000055eb1bfadbd3 in qemu_mutex_lock_impl (mutex=0x55eb1e2d1988, file=, line=) at util/qemu-thread-posix.c:64 + #5 0x000055eb1bb4564a in multifd_send_terminate_threads (err=) at migration/ram.c:1015 + #6 0x000055eb1bb4bb7f in multifd_send_thread (opaque=0x55eb1e2d19f8) at migration/ram.c:1171 + #7 0x000055eb1bfad628 in qemu_thread_start (args=0x55eb1e170450) at util/qemu-thread-posix.c:502 + #8 0x00007f81a2b36df5 in start_thread () from /usr/lib64/libpthread.so.0 + #9 0x00007f81a286048d in clone () from /usr/lib64/libc.so.6 + +To fix it up, let's destroy the mutex after all the other multifd threads had +been terminated. + +Signed-off-by: Jiahui Cen +Signed-off-by: Ying Fang +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 9560a48ecc0c20d87bc458a6db77fba651605819) +Signed-off-by: Danilo C. L.
de Paula +--- + migration/ram.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/migration/ram.c b/migration/ram.c +index 860f781..6c55c5d 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1052,6 +1052,10 @@ void multifd_save_cleanup(void) + if (p->running) { + qemu_thread_join(&p->thread); + } ++ } ++ for (i = 0; i < migrate_multifd_channels(); i++) { ++ MultiFDSendParams *p = &multifd_send_state->params[i]; ++ + socket_send_channel_destroy(p->c); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); +@@ -1335,6 +1339,10 @@ int multifd_load_cleanup(Error **errp) + qemu_sem_post(&p->sem_sync); + qemu_thread_join(&p->thread); + } ++ } ++ for (i = 0; i < migrate_multifd_channels(); i++) { ++ MultiFDRecvParams *p = &multifd_recv_state->params[i]; ++ + object_unref(OBJECT(p->c)); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); +-- +1.8.3.1 + diff --git a/kvm-migration-multifd-fix-nullptr-access-in-multifd_send.patch b/kvm-migration-multifd-fix-nullptr-access-in-multifd_send.patch new file mode 100755 index 0000000..9e9683c --- /dev/null +++ b/kvm-migration-multifd-fix-nullptr-access-in-multifd_send.patch @@ -0,0 +1,75 @@ +From 517a99c5fba163bf684978fe3d9476b619481391 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:42 +0000 +Subject: [PATCH 10/18] migration/multifd: fix nullptr access in + multifd_send_terminate_threads + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-10-quintela@redhat.com> +Patchwork-id: 94117 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 09/10] migration/multifd: fix nullptr access in multifd_send_terminate_threads +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Zhimin Feng + +If the multifd_send_threads is not created when migration is failed, +multifd_save_cleanup would be called twice. In this senario, the +multifd_send_state is accessed after it has been released, the result +is that the source VM is crashing down. 
+ +Here is the coredump stack: + Program received signal SIGSEGV, Segmentation fault. + 0x00005629333a78ef in multifd_send_terminate_threads (err=err@entry=0x0) at migration/ram.c:1012 + 1012 MultiFDSendParams *p = &multifd_send_state->params[i]; + #0 0x00005629333a78ef in multifd_send_terminate_threads (err=err@entry=0x0) at migration/ram.c:1012 + #1 0x00005629333ab8a9 in multifd_save_cleanup () at migration/ram.c:1028 + #2 0x00005629333abaea in multifd_new_send_channel_async (task=0x562935450e70, opaque=) at migration/ram.c:1202 + #3 0x000056293373a562 in qio_task_complete (task=task@entry=0x562935450e70) at io/task.c:196 + #4 0x000056293373a6e0 in qio_task_thread_result (opaque=0x562935450e70) at io/task.c:111 + #5 0x00007f475d4d75a7 in g_idle_dispatch () from /usr/lib64/libglib-2.0.so.0 + #6 0x00007f475d4da9a9 in g_main_context_dispatch () from /usr/lib64/libglib-2.0.so.0 + #7 0x0000562933785b33 in glib_pollfds_poll () at util/main-loop.c:219 + #8 os_host_main_loop_wait (timeout=) at util/main-loop.c:242 + #9 main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:518 + #10 0x00005629334c5acf in main_loop () at vl.c:1810 + #11 0x000056293334d7bb in main (argc=, argv=, envp=) at vl.c:4471 + +If the multifd_send_threads is not created when migration is failed. +In this scenario, we don't call multifd_save_cleanup in multifd_new_send_channel_async. + +Signed-off-by: Zhimin Feng +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 9c4d333c092e9c26d38f740ff3616deb42f21681) +Signed-off-by: Danilo C. L.
de Paula +--- + migration/ram.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 902c56c..3891eff 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1229,7 +1229,15 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) + trace_multifd_new_send_channel_async(p->id); + if (qio_task_propagate_error(task, &local_err)) { + migrate_set_error(migrate_get_current(), local_err); +- multifd_save_cleanup(); ++ /* Error happen, we need to tell who pay attention to me */ ++ qemu_sem_post(&multifd_send_state->channels_ready); ++ qemu_sem_post(&p->sem_sync); ++ /* ++ * Although multifd_send_thread is not created, but main migration ++ * thread neet to judge whether it is running, so we need to mark ++ * its status. ++ */ ++ p->quit = true; + } else { + p->c = QIO_CHANNEL(sioc); + qio_channel_set_delay(p->c, false); +-- +1.8.3.1 + diff --git a/kvm-migration-multifd-fix-nullptr-access-in-terminating-.patch b/kvm-migration-multifd-fix-nullptr-access-in-terminating-.patch new file mode 100755 index 0000000..e780698 --- /dev/null +++ b/kvm-migration-multifd-fix-nullptr-access-in-terminating-.patch @@ -0,0 +1,68 @@ +From 7f664fe26ff67f8131faa7a81a388b8a5b51403f Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:36 +0000 +Subject: [PATCH 04/18] migration/multifd: fix nullptr access in terminating + multifd threads + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-4-quintela@redhat.com> +Patchwork-id: 94110 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 03/10] migration/multifd: fix nullptr access in terminating multifd threads +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Jiahui Cen + +One multifd channel will shutdown all the other multifd's IOChannel when it +fails to receive an IOChannel. 
In this scenario, if some multifds had not +received its IOChannel yet, it would try to shutdown its IOChannel which could +cause nullptr access at qio_channel_shutdown. + +Here is the coredump stack: + #0 object_get_class (obj=obj@entry=0x0) at qom/object.c:908 + #1 0x00005563fdbb8f4a in qio_channel_shutdown (ioc=0x0, how=QIO_CHANNEL_SHUTDOWN_BOTH, errp=0x0) at io/channel.c:355 + #2 0x00005563fd7b4c5f in multifd_recv_terminate_threads (err=) at migration/ram.c:1280 + #3 0x00005563fd7bc019 in multifd_recv_new_channel (ioc=ioc@entry=0x556400255610, errp=errp@entry=0x7ffec07dce00) at migration/ram.c:1478 + #4 0x00005563fda82177 in migration_ioc_process_incoming (ioc=ioc@entry=0x556400255610, errp=errp@entry=0x7ffec07dce30) at migration/migration.c:605 + #5 0x00005563fda8567d in migration_channel_process_incoming (ioc=0x556400255610) at migration/channel.c:44 + #6 0x00005563fda83ee0 in socket_accept_incoming_migration (listener=0x5563fff6b920, cioc=0x556400255610, opaque=) at migration/socket.c:166 + #7 0x00005563fdbc25cd in qio_net_listener_channel_func (ioc=, condition=, opaque=) at io/net-listener.c:54 + #8 0x00007f895b6fe9a9 in g_main_context_dispatch () from /usr/lib64/libglib-2.0.so.0 + #9 0x00005563fdc18136 in glib_pollfds_poll () at util/main-loop.c:218 + #10 0x00005563fdc181b5 in os_host_main_loop_wait (timeout=1000000000) at util/main-loop.c:241 + #11 0x00005563fdc183a2 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:517 + #12 0x00005563fd8edb37 in main_loop () at vl.c:1791 + #13 0x00005563fd74fd45 in main (argc=, argv=, envp=) at vl.c:4473 + +To fix it up, let's check p->c before calling qio_channel_shutdown. + +Signed-off-by: Jiahui Cen +Signed-off-by: Ying Fang +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit f76e32eb05041ab001184ab16afb56524adccd0c) +Signed-off-by: Danilo C. L.
de Paula +--- + migration/ram.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 8c783b3..860f781 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1307,7 +1307,9 @@ static void multifd_recv_terminate_threads(Error *err) + - normal quit, i.e. everything went fine, just finished + - error quit: We close the channels so the channel threads + finish the qio_channel_read_all_eof() */ +- qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); ++ if (p->c) { ++ qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); ++ } + qemu_mutex_unlock(&p->mutex); + } + } +-- +1.8.3.1 + diff --git a/kvm-mirror-Don-t-let-an-operation-wait-for-itself.patch b/kvm-mirror-Don-t-let-an-operation-wait-for-itself.patch new file mode 100755 index 0000000..c20cb6c --- /dev/null +++ b/kvm-mirror-Don-t-let-an-operation-wait-for-itself.patch @@ -0,0 +1,123 @@ +From 261ee33e0e6711fadd3049e4640bb731ee3d44ff Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 24 Feb 2020 16:57:10 +0000 +Subject: [PATCH 9/9] mirror: Don't let an operation wait for itself + +RH-Author: Kevin Wolf +Message-id: <20200224165710.4830-3-kwolf@redhat.com> +Patchwork-id: 94045 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] mirror: Don't let an operation wait for itself +Bugzilla: 1794692 +RH-Acked-by: John Snow +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +mirror_wait_for_free_in_flight_slot() just picks a random operation to +wait for. However, when mirror_co_read() waits for free slots, its +MirrorOp is already in s->ops_in_flight, so if not enough slots are +immediately available, an operation can end up waiting for itself to +complete, which results in a hang. + +Fix this by passing the current MirrorOp and skipping this operation +when picking an operation to wait for. 
+ +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692 +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +(cherry picked from commit 7e6c4ff792734e196c8ca82564c56b5e7c6288ca) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/mirror.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 8959e42..cacbc70 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -283,11 +283,14 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, + } + + static inline void coroutine_fn +-mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) ++mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) + { + MirrorOp *op; + + QTAILQ_FOREACH(op, &s->ops_in_flight, next) { ++ if (self == op) { ++ continue; ++ } + /* Do not wait on pseudo ops, because it may in turn wait on + * some other operation to start, which may in fact be the + * caller of this function. Since there is only one pseudo op +@@ -302,10 +305,10 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) + } + + static inline void coroutine_fn +-mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) ++mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s, MirrorOp *self) + { + /* Only non-active operations use up in-flight slots */ +- mirror_wait_for_any_operation(s, false); ++ mirror_wait_for_any_operation(s, self, false); + } + + /* Perform a mirror copy operation. 
+@@ -348,7 +351,7 @@ static void coroutine_fn mirror_co_read(void *opaque) + + while (s->buf_free_count < nb_chunks) { + trace_mirror_yield_in_flight(s, op->offset, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, op); + } + + /* Now make a QEMUIOVector taking enough granularity-sized chunks +@@ -555,7 +558,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) + + while (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield_in_flight(s, offset, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, pseudo_op); + } + + if (s->ret < 0) { +@@ -609,7 +612,7 @@ static void mirror_free_init(MirrorBlockJob *s) + static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s) + { + while (s->in_flight > 0) { +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, NULL); + } + } + +@@ -794,7 +797,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) + if (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield(s, UINT64_MAX, s->buf_free_count, + s->in_flight); +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, NULL); + continue; + } + +@@ -947,7 +950,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + /* Do not start passive operations while there are active + * writes in progress */ + while (s->in_active_write_counter) { +- mirror_wait_for_any_operation(s, true); ++ mirror_wait_for_any_operation(s, NULL, true); + } + + if (s->ret < 0) { +@@ -973,7 +976,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || + (cnt == 0 && s->in_flight > 0)) { + trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, NULL); + continue; + } else if (cnt != 0) { + delay_ns = mirror_iteration(s); +-- +1.8.3.1 + diff --git 
a/kvm-mirror-Make-sure-that-source-and-target-size-match.patch b/kvm-mirror-Make-sure-that-source-and-target-size-match.patch new file mode 100755 index 0000000..09d1152 --- /dev/null +++ b/kvm-mirror-Make-sure-that-source-and-target-size-match.patch @@ -0,0 +1,89 @@ +From 98bf67db979927a5c7bbdc4a17c35d60b5f38e71 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:24 +0100 +Subject: [PATCH 25/26] mirror: Make sure that source and target size match + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-11-kwolf@redhat.com> +Patchwork-id: 97110 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 10/11] mirror: Make sure that source and target size match +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +If the target is shorter than the source, mirror would copy data until +it reaches the end of the target and then fail with an I/O error when +trying to write past the end. + +If the target is longer than the source, the mirror job would complete +successfully, but the target wouldn't actually be an accurate copy of +the source image (it would contain some additional garbage at the end). + +Fix this by checking that both images have the same size when the job +starts. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Message-Id: <20200511135825.219437-4-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit e83dd6808c6e0975970f37b49b27cc37bb54eea8) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/mirror.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 5e5a521..0d32fca 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -859,6 +859,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + BlockDriverState *target_bs = blk_bs(s->target); + bool need_drain = true; + int64_t length; ++ int64_t target_length; + BlockDriverInfo bdi; + char backing_filename[2]; /* we only need 2 characters because we are only + checking for a NULL string */ +@@ -874,24 +875,26 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + goto immediate_exit; + } + ++ target_length = blk_getlength(s->target); ++ if (target_length < 0) { ++ ret = target_length; ++ goto immediate_exit; ++ } ++ + /* Active commit must resize the base image if its size differs from the + * active layer. */ + if (s->base == blk_bs(s->target)) { +- int64_t base_length; +- +- base_length = blk_getlength(s->target); +- if (base_length < 0) { +- ret = base_length; +- goto immediate_exit; +- } +- +- if (s->bdev_length > base_length) { ++ if (s->bdev_length > target_length) { + ret = blk_truncate(s->target, s->bdev_length, false, + PREALLOC_MODE_OFF, NULL); + if (ret < 0) { + goto immediate_exit; + } + } ++ } else if (s->bdev_length != target_length) { ++ error_setg(errp, "Source and target image have different sizes"); ++ ret = -EINVAL; ++ goto immediate_exit; + } + + if (s->bdev_length == 0) { +-- +1.8.3.1 + diff --git a/kvm-mirror-Store-MirrorOp.co-for-debuggability.patch b/kvm-mirror-Store-MirrorOp.co-for-debuggability.patch new file mode 100755 index 0000000..67f3e54 --- /dev/null +++ b/kvm-mirror-Store-MirrorOp.co-for-debuggability.patch @@ -0,0 +1,51 @@ +From 27fe3b8d42a2c99de01ce20e4b0727079c12da65 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 24 Feb 2020 16:57:09 +0000 +Subject: [PATCH 8/9] mirror: Store MirrorOp.co for debuggability + +RH-Author: Kevin Wolf +Message-id: 
<20200224165710.4830-2-kwolf@redhat.com> +Patchwork-id: 94044 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] mirror: Store MirrorOp.co for debuggability +Bugzilla: 1794692 +RH-Acked-by: John Snow +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +If a coroutine is launched, but the coroutine pointer isn't stored +anywhere, debugging any problems inside the coroutine is quite hard. +Let's store the coroutine pointer of a mirror operation in MirrorOp to +have it available in the debugger. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +(cherry picked from commit eed325b92c3e68417121ea23f96e33af6a4654ed) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/mirror.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/block/mirror.c b/block/mirror.c +index f0f2d9d..8959e42 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -103,6 +103,7 @@ struct MirrorOp { + bool is_pseudo_op; + bool is_active_write; + CoQueue waiting_requests; ++ Coroutine *co; + + QTAILQ_ENTRY(MirrorOp) next; + }; +@@ -429,6 +430,7 @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset, + default: + abort(); + } ++ op->co = co; + + QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next); + qemu_coroutine_enter(co); +-- +1.8.3.1 + diff --git a/kvm-mirror-Wait-only-for-in-flight-operations.patch b/kvm-mirror-Wait-only-for-in-flight-operations.patch new file mode 100755 index 0000000..a06d30e --- /dev/null +++ b/kvm-mirror-Wait-only-for-in-flight-operations.patch @@ -0,0 +1,95 @@ +From bddf389330e11fb0ce17413c1bfa2264a281ded2 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 30 Mar 2020 11:19:24 +0100 +Subject: [PATCH 4/4] mirror: Wait only for in-flight operations + +RH-Author: Kevin Wolf +Message-id: <20200330111924.22938-3-kwolf@redhat.com> +Patchwork-id: 94463 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] mirror: Wait only for in-flight operations +Bugzilla: 1794692 +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max 
Reitz + +mirror_wait_for_free_in_flight_slot() just picks a random operation to +wait for. However, a MirrorOp is already in s->ops_in_flight when +mirror_co_read() waits for free slots, so if not enough slots are +immediately available, an operation can end up waiting for itself, or +two or more operations can wait for each other to complete, which +results in a hang. + +Fix this by adding a flag to MirrorOp that tells us if the request is +already in flight (and therefore occupies slots that it will later +free), and picking only such operations for waiting. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692 +Signed-off-by: Kevin Wolf +Message-Id: <20200326153628.4869-3-kwolf@redhat.com> +Reviewed-by: Eric Blake +Signed-off-by: Kevin Wolf +(cherry picked from commit ce8cabbd17cf738ddfc68384440c38e5dd2fdf97) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/mirror.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 8959e42..5e5a521 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -102,6 +102,7 @@ struct MirrorOp { + + bool is_pseudo_op; + bool is_active_write; ++ bool is_in_flight; + CoQueue waiting_requests; + Coroutine *co; + +@@ -293,7 +294,9 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) + * caller of this function. Since there is only one pseudo op + * at any given time, we will always find some real operation + * to wait on. */ +- if (!op->is_pseudo_op && op->is_active_write == active) { ++ if (!op->is_pseudo_op && op->is_in_flight && ++ op->is_active_write == active) ++ { + qemu_co_queue_wait(&op->waiting_requests, NULL); + return; + } +@@ -367,6 +370,7 @@ static void coroutine_fn mirror_co_read(void *opaque) + /* Copy the dirty cluster. 
*/ + s->in_flight++; + s->bytes_in_flight += op->bytes; ++ op->is_in_flight = true; + trace_mirror_one_iteration(s, op->offset, op->bytes); + + ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes, +@@ -382,6 +386,7 @@ static void coroutine_fn mirror_co_zero(void *opaque) + op->s->in_flight++; + op->s->bytes_in_flight += op->bytes; + *op->bytes_handled = op->bytes; ++ op->is_in_flight = true; + + ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes, + op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0); +@@ -396,6 +401,7 @@ static void coroutine_fn mirror_co_discard(void *opaque) + op->s->in_flight++; + op->s->bytes_in_flight += op->bytes; + *op->bytes_handled = op->bytes; ++ op->is_in_flight = true; + + ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes); + mirror_write_complete(op, ret); +@@ -1306,6 +1312,7 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s, + .offset = offset, + .bytes = bytes, + .is_active_write = true, ++ .is_in_flight = true, + }; + qemu_co_queue_init(&op->waiting_requests); + QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next); +-- +1.8.3.1 + diff --git a/kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch b/kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch new file mode 100755 index 0000000..eb4e9af --- /dev/null +++ b/kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch @@ -0,0 +1,255 @@ +From 67878e1306f9ea6ccd30437327147c46de196a36 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:13 -0500 +Subject: [PATCH 13/18] misc: Replace zero-length arrays with flexible array + member (manual) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-10-thuth@redhat.com> +Patchwork-id: 99506 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 09/12] misc: Replace zero-length arrays with flexible array member (manual) +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann 
+RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Philippe Mathieu-Daudé + +Description copied from Linux kernel commit from Gustavo A. R. Silva +(see [3]): + +--v-- description start --v-- + + The current codebase makes use of the zero-length array language + extension to the C90 standard, but the preferred mechanism to + declare variable-length types such as these ones is a flexible + array member [1], introduced in C99: + + struct foo { + int stuff; + struct boo array[]; + }; + + By making use of the mechanism above, we will get a compiler + warning in case the flexible array does not occur last in the + structure, which will help us prevent some kind of undefined + behavior bugs from being inadvertently introduced [2] to the + Linux codebase from now on. + +--^-- description end --^-- + +Do the similar housekeeping in the QEMU codebase (which uses +C99 since commit 7be41675f7cb). + +All these instances of code were found with the help of the +following command (then manual analysis, without modifying +structures only having a single flexible array member, such as +QEDTable in block/qed.h): + + git grep -F '[0];' + +[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html +[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=76497732932f +[3] https://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git/commit/?id=17642a2fbd2c1 + +Inspired-by: Gustavo A. R. Silva +Reviewed-by: David Hildenbrand +Signed-off-by: Philippe Mathieu-Daudé +Signed-off-by: Paolo Bonzini +(cherry picked from commit 880a7817c1a82a93d3f83dfb25dce1f0db629c66) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + block/vmdk.c | 2 +- + docs/interop/vhost-user.rst | 4 ++-- + hw/char/sclpconsole-lm.c | 2 +- + hw/char/sclpconsole.c | 2 +- + hw/s390x/virtio-ccw.c | 2 +- + include/hw/acpi/acpi-defs.h | 4 ++-- + include/hw/boards.h | 2 +- + include/hw/s390x/event-facility.h | 2 +- + include/hw/s390x/sclp.h | 8 ++++---- + target/s390x/ioinst.c | 2 +- + 10 files changed, 15 insertions(+), 15 deletions(-) + +diff --git a/block/vmdk.c b/block/vmdk.c +index 1bd39917290..8ec18f35a53 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -187,7 +187,7 @@ typedef struct VmdkMetaData { + typedef struct VmdkGrainMarker { + uint64_t lba; + uint32_t size; +- uint8_t data[0]; ++ uint8_t data[]; + } QEMU_PACKED VmdkGrainMarker; + + enum { +diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst +index 7827b710aa0..71b20ce83dd 100644 +--- a/docs/interop/vhost-user.rst ++++ b/docs/interop/vhost-user.rst +@@ -563,7 +563,7 @@ For split virtqueue, queue region can be implemented as: + uint16_t used_idx; + + /* Used to track the state of each descriptor in descriptor table */ +- DescStateSplit desc[0]; ++ DescStateSplit desc[]; + } QueueRegionSplit; + + To track inflight I/O, the queue region should be processed as follows: +@@ -685,7 +685,7 @@ For packed virtqueue, queue region can be implemented as: + uint8_t padding[7]; + + /* Used to track the state of each descriptor fetched from descriptor ring */ +- DescStatePacked desc[0]; ++ DescStatePacked desc[]; + } QueueRegionPacked; + + To track inflight I/O, the queue region should be processed as follows: +diff --git a/hw/char/sclpconsole-lm.c b/hw/char/sclpconsole-lm.c +index 392606259d5..a9a6f2b204c 100644 +--- a/hw/char/sclpconsole-lm.c ++++ b/hw/char/sclpconsole-lm.c +@@ -31,7 +31,7 @@ + typedef struct OprtnsCommand { + EventBufferHeader header; + MDMSU message_unit; +- char data[0]; ++ char data[]; + } QEMU_PACKED OprtnsCommand; + + /* max size for line-mode data in 4K SCCB page */ +diff --git a/hw/char/sclpconsole.c 
b/hw/char/sclpconsole.c +index da126f0133f..55697130a0a 100644 +--- a/hw/char/sclpconsole.c ++++ b/hw/char/sclpconsole.c +@@ -25,7 +25,7 @@ + + typedef struct ASCIIConsoleData { + EventBufferHeader ebh; +- char data[0]; ++ char data[]; + } QEMU_PACKED ASCIIConsoleData; + + /* max size for ASCII data in 4K SCCB page */ +diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c +index 6580ce5907d..aa2c75a49c6 100644 +--- a/hw/s390x/virtio-ccw.c ++++ b/hw/s390x/virtio-ccw.c +@@ -193,7 +193,7 @@ typedef struct VirtioThinintInfo { + typedef struct VirtioRevInfo { + uint16_t revision; + uint16_t length; +- uint8_t data[0]; ++ uint8_t data[]; + } QEMU_PACKED VirtioRevInfo; + + /* Specify where the virtqueues for the subchannel are in guest memory. */ +diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h +index 57a3f58b0c9..b80188b430f 100644 +--- a/include/hw/acpi/acpi-defs.h ++++ b/include/hw/acpi/acpi-defs.h +@@ -152,7 +152,7 @@ typedef struct AcpiSerialPortConsoleRedirection + */ + struct AcpiRsdtDescriptorRev1 { + ACPI_TABLE_HEADER_DEF /* ACPI common table header */ +- uint32_t table_offset_entry[0]; /* Array of pointers to other */ ++ uint32_t table_offset_entry[]; /* Array of pointers to other */ + /* ACPI tables */ + } QEMU_PACKED; + typedef struct AcpiRsdtDescriptorRev1 AcpiRsdtDescriptorRev1; +@@ -162,7 +162,7 @@ typedef struct AcpiRsdtDescriptorRev1 AcpiRsdtDescriptorRev1; + */ + struct AcpiXsdtDescriptorRev2 { + ACPI_TABLE_HEADER_DEF /* ACPI common table header */ +- uint64_t table_offset_entry[0]; /* Array of pointers to other */ ++ uint64_t table_offset_entry[]; /* Array of pointers to other */ + /* ACPI tables */ + } QEMU_PACKED; + typedef struct AcpiXsdtDescriptorRev2 AcpiXsdtDescriptorRev2; +diff --git a/include/hw/boards.h b/include/hw/boards.h +index 2920bdef5b4..a5e92f6c373 100644 +--- a/include/hw/boards.h ++++ b/include/hw/boards.h +@@ -101,7 +101,7 @@ typedef struct CPUArchId { + */ + typedef struct { + int len; +- CPUArchId 
cpus[0]; ++ CPUArchId cpus[]; + } CPUArchIdList; + + /** +diff --git a/include/hw/s390x/event-facility.h b/include/hw/s390x/event-facility.h +index bdc32a3c091..700a610f33c 100644 +--- a/include/hw/s390x/event-facility.h ++++ b/include/hw/s390x/event-facility.h +@@ -122,7 +122,7 @@ typedef struct MDBO { + + typedef struct MDB { + MdbHeader header; +- MDBO mdbo[0]; ++ MDBO mdbo[]; + } QEMU_PACKED MDB; + + typedef struct SclpMsg { +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index df2fa4169b0..62e2aa1d9f1 100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -133,7 +133,7 @@ typedef struct ReadInfo { + uint16_t highest_cpu; + uint8_t _reserved5[124 - 122]; /* 122-123 */ + uint32_t hmfai; +- struct CPUEntry entries[0]; ++ struct CPUEntry entries[]; + } QEMU_PACKED ReadInfo; + + typedef struct ReadCpuInfo { +@@ -143,7 +143,7 @@ typedef struct ReadCpuInfo { + uint16_t nr_standby; /* 12-13 */ + uint16_t offset_standby; /* 14-15 */ + uint8_t reserved0[24-16]; /* 16-23 */ +- struct CPUEntry entries[0]; ++ struct CPUEntry entries[]; + } QEMU_PACKED ReadCpuInfo; + + typedef struct ReadStorageElementInfo { +@@ -152,7 +152,7 @@ typedef struct ReadStorageElementInfo { + uint16_t assigned; + uint16_t standby; + uint8_t _reserved0[16 - 14]; /* 14-15 */ +- uint32_t entries[0]; ++ uint32_t entries[]; + } QEMU_PACKED ReadStorageElementInfo; + + typedef struct AttachStorageElement { +@@ -160,7 +160,7 @@ typedef struct AttachStorageElement { + uint8_t _reserved0[10 - 8]; /* 8-9 */ + uint16_t assigned; + uint8_t _reserved1[16 - 12]; /* 12-15 */ +- uint32_t entries[0]; ++ uint32_t entries[]; + } QEMU_PACKED AttachStorageElement; + + typedef struct AssignStorage { +diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c +index b6be300cc48..a412926d278 100644 +--- a/target/s390x/ioinst.c ++++ b/target/s390x/ioinst.c +@@ -387,7 +387,7 @@ typedef struct ChscResp { + uint16_t len; + uint16_t code; + uint32_t param; +- char data[0]; ++ char data[]; + 
} QEMU_PACKED ChscResp; + + #define CHSC_MIN_RESP_LEN 0x0008 +-- +2.27.0 + diff --git a/kvm-multifd-Make-sure-that-we-don-t-do-any-IO-after-an-e.patch b/kvm-multifd-Make-sure-that-we-don-t-do-any-IO-after-an-e.patch new file mode 100755 index 0000000..bca0b4c --- /dev/null +++ b/kvm-multifd-Make-sure-that-we-don-t-do-any-IO-after-an-e.patch @@ -0,0 +1,74 @@ +From 78c7fb5afcb298631df47f6b71cf764f921c15f4 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:38 +0000 +Subject: [PATCH 06/18] multifd: Make sure that we don't do any IO after an + error + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-6-quintela@redhat.com> +Patchwork-id: 94118 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 05/10] multifd: Make sure that we don't do any IO after an error +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 3d4095b222d97393b1c2c6e514951ec7798f1c43) +Signed-off-by: Danilo C. L. 
de Paula +--- + migration/ram.c | 22 +++++++++++++--------- + 1 file changed, 13 insertions(+), 9 deletions(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 6c55c5d..a0257ee 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -3440,7 +3440,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) + { + RAMState **temp = opaque; + RAMState *rs = *temp; +- int ret; ++ int ret = 0; + int i; + int64_t t0; + int done = 0; +@@ -3511,12 +3511,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) + ram_control_after_iterate(f, RAM_CONTROL_ROUND); + + out: +- multifd_send_sync_main(rs); +- qemu_put_be64(f, RAM_SAVE_FLAG_EOS); +- qemu_fflush(f); +- ram_counters.transferred += 8; ++ if (ret >= 0) { ++ multifd_send_sync_main(rs); ++ qemu_put_be64(f, RAM_SAVE_FLAG_EOS); ++ qemu_fflush(f); ++ ram_counters.transferred += 8; + +- ret = qemu_file_get_error(f); ++ ret = qemu_file_get_error(f); ++ } + if (ret < 0) { + return ret; + } +@@ -3568,9 +3570,11 @@ static int ram_save_complete(QEMUFile *f, void *opaque) + ram_control_after_iterate(f, RAM_CONTROL_FINISH); + } + +- multifd_send_sync_main(rs); +- qemu_put_be64(f, RAM_SAVE_FLAG_EOS); +- qemu_fflush(f); ++ if (ret >= 0) { ++ multifd_send_sync_main(rs); ++ qemu_put_be64(f, RAM_SAVE_FLAG_EOS); ++ qemu_fflush(f); ++ } + + return ret; + } +-- +1.8.3.1 + diff --git a/kvm-nbd-server-Avoid-long-error-message-assertions-CVE-2.patch b/kvm-nbd-server-Avoid-long-error-message-assertions-CVE-2.patch new file mode 100755 index 0000000..94d2c98 --- /dev/null +++ b/kvm-nbd-server-Avoid-long-error-message-assertions-CVE-2.patch @@ -0,0 +1,161 @@ +From f49ff2ed5675f1d0cddc404842e9d6e4e572d5a7 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Wed, 10 Jun 2020 18:32:01 -0400 +Subject: [PATCH 1/2] nbd/server: Avoid long error message assertions + CVE-2020-10761 + +RH-Author: Eric Blake +Message-id: <20200610183202.3780750-2-eblake@redhat.com> +Patchwork-id: 97494 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] nbd/server: 
Avoid long error message assertions CVE-2020-10761 +Bugzilla: 1845384 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Ever since commit 36683283 (v2.8), the server code asserts that error +strings sent to the client are well-formed per the protocol by not +exceeding the maximum string length of 4096. At the time the server +first started sending error messages, the assertion could not be +triggered, because messages were completely under our control. +However, over the years, we have added latent scenarios where a client +could trigger the server to attempt an error message that would +include the client's information if it passed other checks first: + +- requesting NBD_OPT_INFO/GO on an export name that is not present + (commit 0cfae925 in v2.12 echoes the name) + +- requesting NBD_OPT_LIST/SET_META_CONTEXT on an export name that is + not present (commit e7b1948d in v2.12 echoes the name) + +At the time, those were still safe because we flagged names larger +than 256 bytes with a different message; but that changed in commit +93676c88 (v4.2) when we raised the name limit to 4096 to match the NBD +string limit. (That commit also failed to change the magic number +4096 in nbd_negotiate_send_rep_err to the just-introduced named +constant.) So with that commit, long client names appended to server +text can now trigger the assertion, and thus be used as a denial of +service attack against a server. As a mitigating factor, if the +server requires TLS, the client cannot trigger the problematic paths +unless it first supplies TLS credentials, and such trusted clients are +less likely to try to intentionally crash the server. + +We may later want to further sanitize the user-supplied strings we +place into our error messages, such as scrubbing out control +characters, but that is less important to the CVE fix, so it can be a +later patch to the new nbd_sanitize_name. 
+ +Consideration was given to changing the assertion in +nbd_negotiate_send_rep_verr to instead merely log a server error and +truncate the message, to avoid leaving a latent path that could +trigger a future CVE DoS on any new error message. However, this +merely complicates the code for something that is already (correctly) +flagging coding errors, and now that we are aware of the long message +pitfall, we are less likely to introduce such errors in the future, +which would make such error handling dead code. + +Reported-by: Xueqiang Wei +CC: qemu-stable@nongnu.org +Fixes: https://bugzilla.redhat.com/1843684 CVE-2020-10761 +Fixes: 93676c88d7 +Signed-off-by: Eric Blake +Message-Id: <20200610163741.3745251-2-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 5c4fe018c025740fef4a0a4421e8162db0c3eefd) +Signed-off-by: Eric Blake +Signed-off-by: Eduardo Lima (Etrunko) +--- + nbd/server.c | 23 ++++++++++++++++++++--- + tests/qemu-iotests/143 | 4 ++++ + tests/qemu-iotests/143.out | 2 ++ + 3 files changed, 26 insertions(+), 3 deletions(-) + +diff --git a/nbd/server.c b/nbd/server.c +index 24ebc1a805..d5b9df092c 100644 +--- a/nbd/server.c ++++ b/nbd/server.c +@@ -217,7 +217,7 @@ nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type, + + msg = g_strdup_vprintf(fmt, va); + len = strlen(msg); +- assert(len < 4096); ++ assert(len < NBD_MAX_STRING_SIZE); + trace_nbd_negotiate_send_rep_err(msg); + ret = nbd_negotiate_send_rep_len(client, type, len, errp); + if (ret < 0) { +@@ -231,6 +231,19 @@ nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type, + return 0; + } + ++/* ++ * Return a malloc'd copy of @name suitable for use in an error reply. ++ */ ++static char * ++nbd_sanitize_name(const char *name) ++{ ++ if (strnlen(name, 80) < 80) { ++ return g_strdup(name); ++ } ++ /* XXX Should we also try to sanitize any control characters? */ ++ return g_strdup_printf("%.80s...", name); ++} ++ + /* Send an error reply. 
+ * Return -errno on error, 0 on success. */ + static int GCC_FMT_ATTR(4, 5) +@@ -595,9 +608,11 @@ static int nbd_negotiate_handle_info(NBDClient *client, Error **errp) + + exp = nbd_export_find(name); + if (!exp) { ++ g_autofree char *sane_name = nbd_sanitize_name(name); ++ + return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN, + errp, "export '%s' not present", +- name); ++ sane_name); + } + + /* Don't bother sending NBD_INFO_NAME unless client requested it */ +@@ -995,8 +1010,10 @@ static int nbd_negotiate_meta_queries(NBDClient *client, + + meta->exp = nbd_export_find(export_name); + if (meta->exp == NULL) { ++ g_autofree char *sane_name = nbd_sanitize_name(export_name); ++ + return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp, +- "export '%s' not present", export_name); ++ "export '%s' not present", sane_name); + } + + ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp); +diff --git a/tests/qemu-iotests/143 b/tests/qemu-iotests/143 +index f649b36195..d2349903b1 100755 +--- a/tests/qemu-iotests/143 ++++ b/tests/qemu-iotests/143 +@@ -58,6 +58,10 @@ _send_qemu_cmd $QEMU_HANDLE \ + $QEMU_IO_PROG -f raw -c quit \ + "nbd+unix:///no_such_export?socket=$SOCK_DIR/nbd" 2>&1 \ + | _filter_qemu_io | _filter_nbd ++# Likewise, with longest possible name permitted in NBD protocol ++$QEMU_IO_PROG -f raw -c quit \ ++ "nbd+unix:///$(printf %4096d 1 | tr ' ' a)?socket=$SOCK_DIR/nbd" 2>&1 \ ++ | _filter_qemu_io | _filter_nbd | sed 's/aaaa*aa/aa--aa/' + + _send_qemu_cmd $QEMU_HANDLE \ + "{ 'execute': 'quit' }" \ +diff --git a/tests/qemu-iotests/143.out b/tests/qemu-iotests/143.out +index 1f4001c601..fc9c0a761f 100644 +--- a/tests/qemu-iotests/143.out ++++ b/tests/qemu-iotests/143.out +@@ -5,6 +5,8 @@ QA output created by 143 + {"return": {}} + qemu-io: can't open device nbd+unix:///no_such_export?socket=SOCK_DIR/nbd: Requested export not available + server reported: export 'no_such_export' not present ++qemu-io: can't open device 
nbd+unix:///aa--aa1?socket=SOCK_DIR/nbd: Requested export not available ++server reported: export 'aa--aa...' not present + { 'execute': 'quit' } + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-- +2.27.0 + diff --git a/kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch b/kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch new file mode 100755 index 0000000..654a64f --- /dev/null +++ b/kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch @@ -0,0 +1,301 @@ +From 512c7e92808dff66779f7421f1c17a081f18d7e6 Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Thu, 29 Jul 2021 04:56:46 -0400 +Subject: [PATCH 13/14] net: check if the file descriptor is valid before using + it +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +Message-id: <20210726102337.6359-2-lvivier@redhat.com> +Patchwork-id: 101924 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] net: check if the file descriptor is valid before using it +Bugzilla: 1982134 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Philippe Mathieu-Daudé + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1982134 +BRANCH: rhel-8.5.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=38380653 + +qemu_set_nonblock() checks that the file descriptor can be used and, if +not, crashes QEMU. An assert() is used for that. The use of assert() is +used to detect programming error and the coredump will allow to debug +the problem. + +But in the case of the tap device, this assert() can be triggered by +a misconfiguration by the user. At startup, it's not a real problem, but it +can also happen during the hot-plug of a new device, and here it's a +problem because we can crash a perfectly healthy system. 
+ +For instance: + # ip link add link virbr0 name macvtap0 type macvtap mode bridge + # ip link set macvtap0 up + # TAP=/dev/tap$(ip -o link show macvtap0 | cut -d: -f1) + # qemu-system-x86_64 -machine q35 -device pcie-root-port,id=pcie-root-port-0 -monitor stdio 9<> $TAP + (qemu) netdev_add type=tap,id=hostnet0,vhost=on,fd=9 + (qemu) device_add driver=virtio-net-pci,netdev=hostnet0,id=net0,bus=pcie-root-port-0 + (qemu) device_del net0 + (qemu) netdev_del hostnet0 + (qemu) netdev_add type=tap,id=hostnet1,vhost=on,fd=9 + qemu-system-x86_64: .../util/oslib-posix.c:247: qemu_set_nonblock: Assertion `f != -1' failed. + Aborted (core dumped) + +To avoid that, add a function, qemu_try_set_nonblock(), that allows to report the +problem without crashing. + +In the same way, we also update the function for vhostfd in net_init_tap_one() and +for fd in net_init_socket() (both descriptors are provided by the user and can +be wrong). + +Signed-off-by: Laurent Vivier +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang +(cherry picked from commit 894022e616016fe81745753f14adfbd680a1c7ee) +Signed-off-by: Laurent Vivier +Signed-off-by: Miroslav Rezanina +--- + include/qemu/sockets.h | 1 + + net/socket.c | 9 +++++-- + net/tap.c | 25 +++++++++++++++--- + util/oslib-posix.c | 26 +++++++++++++------ + util/oslib-win32.c | 57 ++++++++++++++++++++++++------------------ + 5 files changed, 79 insertions(+), 39 deletions(-) + +diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h +index 57cd049d6e..7d1f813576 100644 +--- a/include/qemu/sockets.h ++++ b/include/qemu/sockets.h +@@ -18,6 +18,7 @@ int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen); + int socket_set_cork(int fd, int v); + int socket_set_nodelay(int fd); + void qemu_set_block(int fd); ++int qemu_try_set_nonblock(int fd); + void qemu_set_nonblock(int fd); + int socket_set_fast_reuse(int fd); + +diff --git a/net/socket.c b/net/socket.c +index c92354049b..2d21fddd9c 100644 +--- a/net/socket.c ++++ 
b/net/socket.c +@@ -725,13 +725,18 @@ int net_init_socket(const Netdev *netdev, const char *name, + } + + if (sock->has_fd) { +- int fd; ++ int fd, ret; + + fd = monitor_fd_param(cur_mon, sock->fd, errp); + if (fd == -1) { + return -1; + } +- qemu_set_nonblock(fd); ++ ret = qemu_try_set_nonblock(fd); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d", ++ name, fd); ++ return -1; ++ } + if (!net_socket_fd_init(peer, "socket", name, fd, 1, sock->mcast, + errp)) { + return -1; +diff --git a/net/tap.c b/net/tap.c +index 6207f61f84..41a20102fd 100644 +--- a/net/tap.c ++++ b/net/tap.c +@@ -689,6 +689,8 @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, + } + + if (vhostfdname) { ++ int ret; ++ + vhostfd = monitor_fd_param(cur_mon, vhostfdname, &err); + if (vhostfd == -1) { + if (tap->has_vhostforce && tap->vhostforce) { +@@ -698,7 +700,12 @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, + } + return; + } +- qemu_set_nonblock(vhostfd); ++ ret = qemu_try_set_nonblock(vhostfd); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d", ++ name, vhostfd); ++ return; ++ } + } else { + vhostfd = open("/dev/vhost-net", O_RDWR); + if (vhostfd < 0) { +@@ -766,6 +773,7 @@ int net_init_tap(const Netdev *netdev, const char *name, + Error *err = NULL; + const char *vhostfdname; + char ifname[128]; ++ int ret = 0; + + assert(netdev->type == NET_CLIENT_DRIVER_TAP); + tap = &netdev->u.tap; +@@ -795,7 +803,12 @@ int net_init_tap(const Netdev *netdev, const char *name, + return -1; + } + +- qemu_set_nonblock(fd); ++ ret = qemu_try_set_nonblock(fd); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d", ++ name, fd); ++ return -1; ++ } + + vnet_hdr = tap_probe_vnet_hdr(fd); + +@@ -810,7 +823,6 @@ int net_init_tap(const Netdev *netdev, const char *name, + char **fds; + char **vhost_fds; + int nfds = 0, nvhosts = 0; +- int ret = 0; + + if
(tap->has_ifname || tap->has_script || tap->has_downscript || + tap->has_vnet_hdr || tap->has_helper || tap->has_queues || +@@ -843,7 +855,12 @@ int net_init_tap(const Netdev *netdev, const char *name, + goto free_fail; + } + +- qemu_set_nonblock(fd); ++ ret = qemu_try_set_nonblock(fd); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d", ++ name, fd); ++ goto free_fail; ++ } + + if (i == 0) { + vnet_hdr = tap_probe_vnet_hdr(fd); +diff --git a/util/oslib-posix.c b/util/oslib-posix.c +index 8f88e4dbe1..db70416dbb 100644 +--- a/util/oslib-posix.c ++++ b/util/oslib-posix.c +@@ -240,25 +240,35 @@ void qemu_set_block(int fd) + assert(f != -1); + } + +-void qemu_set_nonblock(int fd) ++int qemu_try_set_nonblock(int fd) + { + int f; + f = fcntl(fd, F_GETFL); +- assert(f != -1); +- f = fcntl(fd, F_SETFL, f | O_NONBLOCK); +-#ifdef __OpenBSD__ + if (f == -1) { ++ return -errno; ++ } ++ if (fcntl(fd, F_SETFL, f | O_NONBLOCK) == -1) { ++#ifdef __OpenBSD__ + /* + * Previous to OpenBSD 6.3, fcntl(F_SETFL) is not permitted on + * memory devices and sets errno to ENODEV. + * It's OK if we fail to set O_NONBLOCK on devices like /dev/null, + * because they will never block anyway. 
+ */ +- assert(errno == ENODEV); +- } +-#else +- assert(f != -1); ++ if (errno == ENODEV) { ++ return 0; ++ } + #endif ++ return -errno; ++ } ++ return 0; ++} ++ ++void qemu_set_nonblock(int fd) ++{ ++ int f; ++ f = qemu_try_set_nonblock(fd); ++ assert(f == 0); + } + + int socket_set_fast_reuse(int fd) +diff --git a/util/oslib-win32.c b/util/oslib-win32.c +index 3b49d27297..7eedbe5859 100644 +--- a/util/oslib-win32.c ++++ b/util/oslib-win32.c +@@ -132,31 +132,6 @@ struct tm *localtime_r(const time_t *timep, struct tm *result) + } + #endif /* CONFIG_LOCALTIME_R */ + +-void qemu_set_block(int fd) +-{ +- unsigned long opt = 0; +- WSAEventSelect(fd, NULL, 0); +- ioctlsocket(fd, FIONBIO, &opt); +-} +- +-void qemu_set_nonblock(int fd) +-{ +- unsigned long opt = 1; +- ioctlsocket(fd, FIONBIO, &opt); +- qemu_fd_register(fd); +-} +- +-int socket_set_fast_reuse(int fd) +-{ +- /* Enabling the reuse of an endpoint that was used by a socket still in +- * TIME_WAIT state is usually performed by setting SO_REUSEADDR. On Windows +- * fast reuse is the default and SO_REUSEADDR does strange things. So we +- * don't have to do anything here. More info can be found at: +- * http://msdn.microsoft.com/en-us/library/windows/desktop/ms740621.aspx */ +- return 0; +-} +- +- + static int socket_error(void) + { + switch (WSAGetLastError()) { +@@ -233,6 +208,38 @@ static int socket_error(void) + } + } + ++void qemu_set_block(int fd) ++{ ++ unsigned long opt = 0; ++ WSAEventSelect(fd, NULL, 0); ++ ioctlsocket(fd, FIONBIO, &opt); ++} ++ ++int qemu_try_set_nonblock(int fd) ++{ ++ unsigned long opt = 1; ++ if (ioctlsocket(fd, FIONBIO, &opt) != NO_ERROR) { ++ return -socket_error(); ++ } ++ qemu_fd_register(fd); ++ return 0; ++} ++ ++void qemu_set_nonblock(int fd) ++{ ++ (void)qemu_try_set_nonblock(fd); ++} ++ ++int socket_set_fast_reuse(int fd) ++{ ++ /* Enabling the reuse of an endpoint that was used by a socket still in ++ * TIME_WAIT state is usually performed by setting SO_REUSEADDR. 
On Windows ++ * fast reuse is the default and SO_REUSEADDR does strange things. So we ++ * don't have to do anything here. More info can be found at: ++ * http://msdn.microsoft.com/en-us/library/windows/desktop/ms740621.aspx */ ++ return 0; ++} ++ + int inet_aton(const char *cp, struct in_addr *ia) + { + uint32_t addr = inet_addr(cp); +-- +2.27.0 + diff --git a/kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch b/kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch new file mode 100755 index 0000000..8718c71 --- /dev/null +++ b/kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch @@ -0,0 +1,221 @@ +From 3475ea6598896edb689ca8ba6fb81781e2517b6f Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Thu, 29 Jul 2021 04:56:49 -0400 +Subject: [PATCH 14/14] net: detect errors from probing vnet hdr flag for TAP + devices +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +Message-id: <20210726102337.6359-3-lvivier@redhat.com> +Patchwork-id: 101923 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] net: detect errors from probing vnet hdr flag for TAP devices +Bugzilla: 1982134 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Philippe Mathieu-Daudé + +From: "Daniel P. Berrange" + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1982134 +BRANCH: rhel-8.5.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=38380653 + +When QEMU sets up a tap based network device backend, it mostly ignores errors +reported from various ioctl() calls it makes, assuming the TAP file descriptor +is valid. This assumption can easily be violated when the user is passing in a +pre-opened file descriptor. 
At best, the ioctls may fail with a -EBADF, but if +the user passes in a bogus FD number that happens to clash with a FD number that +QEMU has opened internally for another reason, a wide variety of errnos may +result, as the TUNGETIFF ioctl number may map to a completely different command +on a different type of file. + +By ignoring all these errors, QEMU sets up a zombie network backend that will +never pass any data. Even worse, when QEMU shuts down, or that network backend +is hot-removed, it will close this bogus file descriptor, which could belong to +another QEMU device backend. + +There's no obvious guaranteed reliable way to detect that a FD genuinely is a +TAP device, as opposed to a UNIX socket, or pipe, or something else. Checking +the errno from probing vnet hdr flag though, does catch the big common cases. +ie calling TUNGETIFF will return EBADF for an invalid FD, and ENOTTY when FD is +a UNIX socket, or pipe which catches accidental collisions with FDs used for +stdio, or monitor socket. + +Previously the example below where bogus fd 9 collides with the FD used for the +chardev saw: + +$ ./x86_64-softmmu/qemu-system-x86_64 -netdev tap,id=hostnet0,fd=9 \ + -chardev socket,id=charchannel0,path=/tmp/qga,server,nowait \ + -monitor stdio -vnc :0 +qemu-system-x86_64: -netdev tap,id=hostnet0,fd=9: TUNGETIFF ioctl() failed: Inappropriate ioctl for device +TUNSETOFFLOAD ioctl() failed: Bad address +QEMU 2.9.1 monitor - type 'help' for more information +(qemu) Warning: netdev hostnet0 has no peer + +which gives a running QEMU with a zombie network backend. 
+ +With this change applied we get an error message and QEMU immediately exits +before carrying on and making a bigger disaster: + +$ ./x86_64-softmmu/qemu-system-x86_64 -netdev tap,id=hostnet0,fd=9 \ + -chardev socket,id=charchannel0,path=/tmp/qga,server,nowait \ + -monitor stdio -vnc :0 +qemu-system-x86_64: -netdev tap,id=hostnet0,vhost=on,fd=9: Unable to query TUNGETIFF on FD 9: Inappropriate ioctl for device + +Reported-by: Dr. David Alan Gilbert +Signed-off-by: Daniel P. Berrange +Tested-by: Dr. David Alan Gilbert +Message-id: 20171027085548.3472-1-berrange@redhat.com +[lv: to simplify, don't check on EINVAL with TUNGETIFF as it exists since v2.6.27] +Signed-off-by: Laurent Vivier +Signed-off-by: Jason Wang +(cherry picked from commit e7b347d0bf640adb1c998d317eaf44d2d7cbd973) +Signed-off-by: Laurent Vivier +Signed-off-by: Miroslav Rezanina +--- + net/tap-bsd.c | 2 +- + net/tap-linux.c | 8 +++++--- + net/tap-solaris.c | 2 +- + net/tap-stub.c | 2 +- + net/tap.c | 25 ++++++++++++++++++++----- + net/tap_int.h | 2 +- + 6 files changed, 29 insertions(+), 12 deletions(-) + +diff --git a/net/tap-bsd.c b/net/tap-bsd.c +index a5c3707f80..77aaf674b1 100644 +--- a/net/tap-bsd.c ++++ b/net/tap-bsd.c +@@ -211,7 +211,7 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) + { + } + +-int tap_probe_vnet_hdr(int fd) ++int tap_probe_vnet_hdr(int fd, Error **errp) + { + return 0; + } +diff --git a/net/tap-linux.c b/net/tap-linux.c +index e0dd442ee3..b0635e9e32 100644 +--- a/net/tap-linux.c ++++ b/net/tap-linux.c +@@ -147,13 +147,15 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) + } + } + +-int tap_probe_vnet_hdr(int fd) ++int tap_probe_vnet_hdr(int fd, Error **errp) + { + struct ifreq ifr; + + if (ioctl(fd, TUNGETIFF, &ifr) != 0) { +- error_report("TUNGETIFF ioctl() failed: %s", strerror(errno)); +- return 0; ++ /* TUNGETIFF is available since kernel v2.6.27 */ ++ error_setg_errno(errp, errno, ++ "Unable to query TUNGETIFF on FD 
%d", fd); ++ return -1; + } + + return ifr.ifr_flags & IFF_VNET_HDR; +diff --git a/net/tap-solaris.c b/net/tap-solaris.c +index 4725d2314e..ae2ba68284 100644 +--- a/net/tap-solaris.c ++++ b/net/tap-solaris.c +@@ -206,7 +206,7 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) + { + } + +-int tap_probe_vnet_hdr(int fd) ++int tap_probe_vnet_hdr(int fd, Error **errp) + { + return 0; + } +diff --git a/net/tap-stub.c b/net/tap-stub.c +index a9ab8f8293..de525a2e69 100644 +--- a/net/tap-stub.c ++++ b/net/tap-stub.c +@@ -37,7 +37,7 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) + { + } + +-int tap_probe_vnet_hdr(int fd) ++int tap_probe_vnet_hdr(int fd, Error **errp) + { + return 0; + } +diff --git a/net/tap.c b/net/tap.c +index 41a20102fd..b37ccae00c 100644 +--- a/net/tap.c ++++ b/net/tap.c +@@ -597,7 +597,11 @@ int net_init_bridge(const Netdev *netdev, const char *name, + } + + qemu_set_nonblock(fd); +- vnet_hdr = tap_probe_vnet_hdr(fd); ++ vnet_hdr = tap_probe_vnet_hdr(fd, errp); ++ if (vnet_hdr < 0) { ++ close(fd); ++ return -1; ++ } + s = net_tap_fd_init(peer, "bridge", name, fd, vnet_hdr); + + snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s,br=%s", helper, +@@ -810,7 +814,11 @@ int net_init_tap(const Netdev *netdev, const char *name, + return -1; + } + +- vnet_hdr = tap_probe_vnet_hdr(fd); ++ vnet_hdr = tap_probe_vnet_hdr(fd, errp); ++ if (vnet_hdr < 0) { ++ close(fd); ++ return -1; ++ } + + net_init_tap_one(tap, peer, "tap", name, NULL, + script, downscript, +@@ -863,8 +871,11 @@ int net_init_tap(const Netdev *netdev, const char *name, + } + + if (i == 0) { +- vnet_hdr = tap_probe_vnet_hdr(fd); +- } else if (vnet_hdr != tap_probe_vnet_hdr(fd)) { ++ vnet_hdr = tap_probe_vnet_hdr(fd, errp); ++ if (vnet_hdr < 0) { ++ goto free_fail; ++ } ++ } else if (vnet_hdr != tap_probe_vnet_hdr(fd, NULL)) { + error_setg(errp, + "vnet_hdr not consistent across given tap fds"); + ret = -1; +@@ -909,7 +920,11 @@ free_fail: + 
} + + qemu_set_nonblock(fd); +- vnet_hdr = tap_probe_vnet_hdr(fd); ++ vnet_hdr = tap_probe_vnet_hdr(fd, errp); ++ if (vnet_hdr < 0) { ++ close(fd); ++ return -1; ++ } + + net_init_tap_one(tap, peer, "bridge", name, ifname, + script, downscript, vhostfdname, +diff --git a/net/tap_int.h b/net/tap_int.h +index e3194b23f4..225a49ea48 100644 +--- a/net/tap_int.h ++++ b/net/tap_int.h +@@ -34,7 +34,7 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen); + + void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp); +-int tap_probe_vnet_hdr(int fd); ++int tap_probe_vnet_hdr(int fd, Error **errp); + int tap_probe_vnet_hdr_len(int fd, int len); + int tap_probe_has_ufo(int fd); + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); +-- +2.27.0 + diff --git a/kvm-net-forbid-the-reentrant-RX.patch b/kvm-net-forbid-the-reentrant-RX.patch new file mode 100755 index 0000000..aaf57ed --- /dev/null +++ b/kvm-net-forbid-the-reentrant-RX.patch @@ -0,0 +1,50 @@ +From 1e01e2f96fd5e903394eab59365d5363394c8b18 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 13 Apr 2021 18:59:12 -0400 +Subject: [PATCH 3/5] net: forbid the reentrant RX + +RH-Author: Jon Maloy +Message-id: <20210413185912.3811035-2-jmaloy@redhat.com> +Patchwork-id: 101467 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] net: forbid the reentrant RX +Bugzilla: 1859175 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Xiao Wang + +From: Jason Wang + +The memory API allows DMA into NIC's MMIO area. This means the NIC's +RX routine must be reentrant. Instead of auditing all the NIC, we can +simply detect the reentrancy and return early. The queue->delivering +is set and cleared by qemu_net_queue_deliver() for other queue helpers +to know whether the delivering in on going (NIC's receive is being +called). We can check it and return early in qemu_net_queue_flush() to +forbid reentrant RX. 
+ +Signed-off-by: Jason Wang + +(cherry picked from commit 22dc8663d9fc7baa22100544c600b6285a63c7a3) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + net/queue.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/net/queue.c b/net/queue.c +index 61276ca4be..c679d79f4b 100644 +--- a/net/queue.c ++++ b/net/queue.c +@@ -250,6 +250,9 @@ void qemu_net_queue_purge(NetQueue *queue, NetClientState *from) + + bool qemu_net_queue_flush(NetQueue *queue) + { ++ if (queue->delivering) ++ return false; ++ + while (!QTAILQ_EMPTY(&queue->packets)) { + NetPacket *packet; + int ret; +-- +2.27.0 + diff --git a/kvm-net-introduce-qemu_receive_packet.patch b/kvm-net-introduce-qemu_receive_packet.patch new file mode 100755 index 0000000..8de8cae --- /dev/null +++ b/kvm-net-introduce-qemu_receive_packet.patch @@ -0,0 +1,187 @@ +From 89732bf03b26daaebbd3e6e031e79459ae3f77e1 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:39 -0400 +Subject: [PATCH 1/9] net: introduce qemu_receive_packet() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-2-jmaloy@redhat.com> +Patchwork-id: 101785 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 1/9] net: introduce qemu_receive_packet() +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +Some NIC supports loopback mode and this is done by calling +nc->info->receive() directly which in fact suppresses the effort of +reentrancy check that is done in qemu_net_queue_send(). + +Unfortunately we can't use qemu_net_queue_send() here since for +loopback there's no sender as peer, so this patch introduce a +qemu_receive_packet() which is used for implementing loopback mode +for a NIC with this check. + +NIC that supports loopback mode will be converted to this helper. + +This is intended to address CVE-2021-3416. 
+ +Cc: Prasad J Pandit +Reviewed-by: Philippe Mathieu-Daudé +Cc: qemu-stable@nongnu.org +Signed-off-by: Jason Wang + +(cherry picked from commit 705df5466c98f3efdd2b68d3b31dad86858acad7) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + include/net/net.h | 5 +++++ + include/net/queue.h | 8 ++++++++ + net/net.c | 38 +++++++++++++++++++++++++++++++------- + net/queue.c | 22 ++++++++++++++++++++++ + 4 files changed, 66 insertions(+), 7 deletions(-) + +diff --git a/include/net/net.h b/include/net/net.h +index e175ba9677..1b32a8aaec 100644 +--- a/include/net/net.h ++++ b/include/net/net.h +@@ -142,12 +142,17 @@ void *qemu_get_nic_opaque(NetClientState *nc); + void qemu_del_net_client(NetClientState *nc); + typedef void (*qemu_nic_foreach)(NICState *nic, void *opaque); + void qemu_foreach_nic(qemu_nic_foreach func, void *opaque); ++int qemu_can_receive_packet(NetClientState *nc); + int qemu_can_send_packet(NetClientState *nc); + ssize_t qemu_sendv_packet(NetClientState *nc, const struct iovec *iov, + int iovcnt); + ssize_t qemu_sendv_packet_async(NetClientState *nc, const struct iovec *iov, + int iovcnt, NetPacketSent *sent_cb); + ssize_t qemu_send_packet(NetClientState *nc, const uint8_t *buf, int size); ++ssize_t qemu_receive_packet(NetClientState *nc, const uint8_t *buf, int size); ++ssize_t qemu_receive_packet_iov(NetClientState *nc, ++ const struct iovec *iov, ++ int iovcnt); + ssize_t qemu_send_packet_raw(NetClientState *nc, const uint8_t *buf, int size); + ssize_t qemu_send_packet_async(NetClientState *nc, const uint8_t *buf, + int size, NetPacketSent *sent_cb); +diff --git a/include/net/queue.h b/include/net/queue.h +index c0269bb1dc..9f2f289d77 100644 +--- a/include/net/queue.h ++++ b/include/net/queue.h +@@ -55,6 +55,14 @@ void qemu_net_queue_append_iov(NetQueue *queue, + + void qemu_del_net_queue(NetQueue *queue); + ++ssize_t qemu_net_queue_receive(NetQueue *queue, ++ const uint8_t *data, ++ size_t size); ++ ++ssize_t 
qemu_net_queue_receive_iov(NetQueue *queue, ++ const struct iovec *iov, ++ int iovcnt); ++ + ssize_t qemu_net_queue_send(NetQueue *queue, + NetClientState *sender, + unsigned flags, +diff --git a/net/net.c b/net/net.c +index 84aa6d8d00..d0b651ca95 100644 +--- a/net/net.c ++++ b/net/net.c +@@ -516,6 +516,17 @@ int qemu_set_vnet_be(NetClientState *nc, bool is_be) + #endif + } + ++int qemu_can_receive_packet(NetClientState *nc) ++{ ++ if (nc->receive_disabled) { ++ return 0; ++ } else if (nc->info->can_receive && ++ !nc->info->can_receive(nc)) { ++ return 0; ++ } ++ return 1; ++} ++ + int qemu_can_send_packet(NetClientState *sender) + { + int vm_running = runstate_is_running(); +@@ -528,13 +539,7 @@ int qemu_can_send_packet(NetClientState *sender) + return 1; + } + +- if (sender->peer->receive_disabled) { +- return 0; +- } else if (sender->peer->info->can_receive && +- !sender->peer->info->can_receive(sender->peer)) { +- return 0; +- } +- return 1; ++ return qemu_can_receive_packet(sender->peer); + } + + static ssize_t filter_receive_iov(NetClientState *nc, +@@ -667,6 +672,25 @@ ssize_t qemu_send_packet(NetClientState *nc, const uint8_t *buf, int size) + return qemu_send_packet_async(nc, buf, size, NULL); + } + ++ssize_t qemu_receive_packet(NetClientState *nc, const uint8_t *buf, int size) ++{ ++ if (!qemu_can_receive_packet(nc)) { ++ return 0; ++ } ++ ++ return qemu_net_queue_receive(nc->incoming_queue, buf, size); ++} ++ ++ssize_t qemu_receive_packet_iov(NetClientState *nc, const struct iovec *iov, ++ int iovcnt) ++{ ++ if (!qemu_can_receive_packet(nc)) { ++ return 0; ++ } ++ ++ return qemu_net_queue_receive_iov(nc->incoming_queue, iov, iovcnt); ++} ++ + ssize_t qemu_send_packet_raw(NetClientState *nc, const uint8_t *buf, int size) + { + return qemu_send_packet_async_with_flags(nc, QEMU_NET_PACKET_FLAG_RAW, +diff --git a/net/queue.c b/net/queue.c +index c679d79f4b..5f0f9ffcaf 100644 +--- a/net/queue.c ++++ b/net/queue.c +@@ -182,6 +182,28 @@ static ssize_t 
qemu_net_queue_deliver_iov(NetQueue *queue, + return ret; + } + ++ssize_t qemu_net_queue_receive(NetQueue *queue, ++ const uint8_t *data, ++ size_t size) ++{ ++ if (queue->delivering) { ++ return 0; ++ } ++ ++ return qemu_net_queue_deliver(queue, NULL, 0, data, size); ++} ++ ++ssize_t qemu_net_queue_receive_iov(NetQueue *queue, ++ const struct iovec *iov, ++ int iovcnt) ++{ ++ if (queue->delivering) { ++ return 0; ++ } ++ ++ return qemu_net_queue_deliver_iov(queue, NULL, 0, iov, iovcnt); ++} ++ + ssize_t qemu_net_queue_send(NetQueue *queue, + NetClientState *sender, + unsigned flags, +-- +2.27.0 + diff --git a/kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch b/kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch new file mode 100755 index 0000000..b619e78 --- /dev/null +++ b/kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch @@ -0,0 +1,59 @@ +From b7de63e72c479df42c324c058a487517210fa069 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 13 Apr 2021 19:21:50 -0400 +Subject: [PATCH 1/5] net: remove an assert call in eth_get_gso_type + +RH-Author: Jon Maloy +Message-id: <20210413192150.3817133-2-jmaloy@redhat.com> +Patchwork-id: 101469 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] net: remove an assert call in eth_get_gso_type +Bugzilla: 1892350 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Xiao Wang + +From: Prasad J Pandit + +eth_get_gso_type() routine returns segmentation offload type based on +L3 protocol type. It calls g_assert_not_reached if L3 protocol is +unknown, making the following return statement unreachable. Remove the +g_assert call, it maybe triggered by a guest user. + +Reported-by: Gaoning Pan +Signed-off-by: Prasad J Pandit +Signed-off-by: Jason Wang + +(cherry picked from commit 7564bf7701f00214cdc8a678a9f7df765244def1) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + net/eth.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/net/eth.c b/net/eth.c +index 0c1d413ee2..1e0821c5f8 100644 +--- a/net/eth.c ++++ b/net/eth.c +@@ -16,6 +16,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/log.h" + #include "net/eth.h" + #include "net/checksum.h" + #include "net/tap.h" +@@ -71,9 +72,8 @@ eth_get_gso_type(uint16_t l3_proto, uint8_t *l3_hdr, uint8_t l4proto) + return VIRTIO_NET_HDR_GSO_TCPV6 | ecn_state; + } + } +- +- /* Unsupported offload */ +- g_assert_not_reached(); ++ qemu_log_mask(LOG_UNIMP, "%s: probably not GSO frame, " ++ "unknown L3 protocol: 0x%04"PRIx16"\n", __func__, l3_proto); + + return VIRTIO_NET_HDR_GSO_NONE | ecn_state; + } +-- +2.27.0 + diff --git a/kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch b/kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch new file mode 100755 index 0000000..6d9382c --- /dev/null +++ b/kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch @@ -0,0 +1,318 @@ +From 70f8bbb27f9f357ea83ff6639fc00aa60fc902b9 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:47 +0100 +Subject: [PATCH 04/12] numa: Extend CLI to provide initiator information for + numa nodes + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-4-plai@redhat.com> +Patchwork-id: 96736 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 03/11] numa: Extend CLI to provide initiator information for numa nodes +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Tao Xu + +In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT), +The initiator represents processor which access to memory. And in 5.2.27.3 +Memory Proximity Domain Attributes Structure, the attached initiator is +defined as where the memory controller responsible for a memory proximity +domain. With attached initiator information, the topology of heterogeneous +memory can be described. 
Add new machine property 'hmat' to enable all +HMAT specific options. + +Extend CLI of "-numa node" option to indicate the initiator numa node-id. +In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report +the platform's HMAT tables. Before using initiator option, enable HMAT with +-machine hmat=on. + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Reviewed-by: Jingqi Liu +Suggested-by: Dan Williams +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-2-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 244b3f4485a07c7ce4b7123d6ce9d8c6012756e8) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/core/machine.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/core/numa.c | 23 ++++++++++++++++++ + include/sysemu/numa.h | 5 ++++ + qapi/machine.json | 10 +++++++- + qemu-options.hx | 35 ++++++++++++++++++++++++---- + 5 files changed, 131 insertions(+), 6 deletions(-) + +diff --git a/hw/core/machine.c b/hw/core/machine.c +index 19c78c6..cb21ae1 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -688,6 +688,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp) + ms->nvdimms_state->is_enabled = value; + } + ++static bool machine_get_hmat(Object *obj, Error **errp) ++{ ++ MachineState *ms = MACHINE(obj); ++ ++ return ms->numa_state->hmat_enabled; ++} ++ ++static void machine_set_hmat(Object *obj, bool value, Error **errp) ++{ ++ MachineState *ms = MACHINE(obj); ++ ++ ms->numa_state->hmat_enabled = value; ++} ++ + static char *machine_get_nvdimm_persistence(Object *obj, Error **errp) + { + MachineState *ms = MACHINE(obj); +@@ -815,6 +829,7 @@ void machine_set_cpu_numa_node(MachineState *machine, + const CpuInstanceProperties *props, Error **errp) + { + MachineClass *mc = MACHINE_GET_CLASS(machine); ++ NodeInfo *numa_info = machine->numa_state->nodes; + bool match = false; + int i; + +@@ -884,6 +899,17 @@ void 
machine_set_cpu_numa_node(MachineState *machine, + match = true; + slot->props.node_id = props->node_id; + slot->props.has_node_id = props->has_node_id; ++ ++ if (machine->numa_state->hmat_enabled) { ++ if ((numa_info[props->node_id].initiator < MAX_NODES) && ++ (props->node_id != numa_info[props->node_id].initiator)) { ++ error_setg(errp, "The initiator of CPU NUMA node %" PRId64 ++ " should be itself", props->node_id); ++ return; ++ } ++ numa_info[props->node_id].has_cpu = true; ++ numa_info[props->node_id].initiator = props->node_id; ++ } + } + + if (!match) { +@@ -1130,6 +1156,13 @@ static void machine_initfn(Object *obj) + + if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) { + ms->numa_state = g_new0(NumaState, 1); ++ object_property_add_bool(obj, "hmat", ++ machine_get_hmat, machine_set_hmat, ++ &error_abort); ++ object_property_set_description(obj, "hmat", ++ "Set on/off to enable/disable " ++ "ACPI Heterogeneous Memory Attribute " ++ "Table (HMAT)", NULL); + } + + /* Register notifier when init is done for sysbus sanity checks */ +@@ -1218,6 +1251,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu) + return g_string_free(s, false); + } + ++static void numa_validate_initiator(NumaState *numa_state) ++{ ++ int i; ++ NodeInfo *numa_info = numa_state->nodes; ++ ++ for (i = 0; i < numa_state->num_nodes; i++) { ++ if (numa_info[i].initiator == MAX_NODES) { ++ error_report("The initiator of NUMA node %d is missing, use " ++ "'-numa node,initiator' option to declare it", i); ++ exit(1); ++ } ++ ++ if (!numa_info[numa_info[i].initiator].present) { ++ error_report("NUMA node %" PRIu16 " is missing, use " ++ "'-numa node' option to declare it first", ++ numa_info[i].initiator); ++ exit(1); ++ } ++ ++ if (!numa_info[numa_info[i].initiator].has_cpu) { ++ error_report("The initiator of NUMA node %d is invalid", i); ++ exit(1); ++ } ++ } ++} ++ + static void machine_numa_finish_cpu_init(MachineState *machine) + { + int i; +@@ -1258,6 +1317,11 @@ 
static void machine_numa_finish_cpu_init(MachineState *machine) + machine_set_cpu_numa_node(machine, &props, &error_fatal); + } + } ++ ++ if (machine->numa_state->hmat_enabled) { ++ numa_validate_initiator(machine->numa_state); ++ } ++ + if (s->len && !qtest_enabled()) { + warn_report("CPU(s) not present in any NUMA nodes: %s", + s->str); +diff --git a/hw/core/numa.c b/hw/core/numa.c +index 19f082d..a07eef9 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -129,6 +129,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, + numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL); + numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); + } ++ ++ /* ++ * If not set the initiator, set it to MAX_NODES. And if ++ * HMAT is enabled and this node has no cpus, QEMU will raise error. ++ */ ++ numa_info[nodenr].initiator = MAX_NODES; ++ if (node->has_initiator) { ++ if (!ms->numa_state->hmat_enabled) { ++ error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " ++ "(HMAT) is disabled, enable it with -machine hmat=on " ++ "before using any of hmat specific options"); ++ return; ++ } ++ ++ if (node->initiator >= MAX_NODES) { ++ error_report("The initiator id %" PRIu16 " expects an integer " ++ "between 0 and %d", node->initiator, ++ MAX_NODES - 1); ++ return; ++ } ++ ++ numa_info[nodenr].initiator = node->initiator; ++ } + numa_info[nodenr].present = true; + max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); + ms->numa_state->num_nodes++; +diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h +index ae9c41d..788cbec 100644 +--- a/include/sysemu/numa.h ++++ b/include/sysemu/numa.h +@@ -18,6 +18,8 @@ struct NodeInfo { + uint64_t node_mem; + struct HostMemoryBackend *node_memdev; + bool present; ++ bool has_cpu; ++ uint16_t initiator; + uint8_t distance[MAX_NODES]; + }; + +@@ -33,6 +35,9 @@ struct NumaState { + /* Allow setting NUMA distance for different NUMA nodes */ + bool have_numa_distance; + ++ /* Detect if HMAT support is 
enabled. */ ++ bool hmat_enabled; ++ + /* NUMA nodes information */ + NodeInfo nodes[MAX_NODES]; + }; +diff --git a/qapi/machine.json b/qapi/machine.json +index ca26779..27d0e37 100644 +--- a/qapi/machine.json ++++ b/qapi/machine.json +@@ -463,6 +463,13 @@ + # @memdev: memory backend object. If specified for one node, + # it must be specified for all nodes. + # ++# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, ++# points to the nodeid which has the memory controller ++# responsible for this NUMA node. This field provides ++# additional information as to the initiator node that ++# is closest (as in directly attached) to this node, and ++# therefore has the best performance (since 5.0) ++# + # Since: 2.1 + ## + { 'struct': 'NumaNodeOptions', +@@ -470,7 +477,8 @@ + '*nodeid': 'uint16', + '*cpus': ['uint16'], + '*mem': 'size', +- '*memdev': 'str' }} ++ '*memdev': 'str', ++ '*initiator': 'uint16' }} + + ## + # @NumaDistOptions: +diff --git a/qemu-options.hx b/qemu-options.hx +index df1d27b..e2ce754 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -43,7 +43,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \ + " suppress-vmdesc=on|off disables self-describing migration (default=off)\n" + " nvdimm=on|off controls NVDIMM support (default=off)\n" + " enforce-config-section=on|off enforce configuration section migration (default=off)\n" +- " memory-encryption=@var{} memory encryption object to use (default=none)\n", ++ " memory-encryption=@var{} memory encryption object to use (default=none)\n" ++ " hmat=on|off controls ACPI HMAT support (default=off)\n", + QEMU_ARCH_ALL) + STEXI + @item -machine [type=]@var{name}[,prop=@var{value}[,...]] +@@ -103,6 +104,9 @@ NOTE: this parameter is deprecated. Please use @option{-global} + @option{migration.send-configuration}=@var{on|off} instead. + @item memory-encryption=@var{} + Memory encryption object to use. The default is none. 
++@item hmat=on|off ++Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support. ++The default is off. + @end table + ETEXI + +@@ -161,14 +165,14 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi + ETEXI + + DEF("numa", HAS_ARG, QEMU_OPTION_numa, +- "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" +- "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" ++ "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" ++ "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa dist,src=source,dst=destination,val=distance\n" + "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", + QEMU_ARCH_ALL) + STEXI +-@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] +-@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] ++@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] ++@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] + @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} + @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] + @findex -numa +@@ -215,6 +219,27 @@ split equally between them. + @samp{mem} and @samp{memdev} are mutually exclusive. Furthermore, + if one node uses @samp{memdev}, all of them have to use it. + ++@samp{initiator} is an additional option that points to an @var{initiator} ++NUMA node that has best performance (the lowest latency or largest bandwidth) ++to this NUMA @var{node}. Note that this option can be set only when ++the machine property 'hmat' is set to 'on'. ++ ++Following example creates a machine with 2 NUMA nodes, node 0 has CPU. ++node 1 has only memory, and its initiator is node 0. 
Note that because ++node 0 has CPU, by default the initiator of node 0 is itself and must be ++itself. ++@example ++-machine hmat=on \ ++-m 2G,slots=2,maxmem=4G \ ++-object memory-backend-ram,size=1G,id=m0 \ ++-object memory-backend-ram,size=1G,id=m1 \ ++-numa node,nodeid=0,memdev=m0 \ ++-numa node,nodeid=1,memdev=m1,initiator=0 \ ++-smp 2,sockets=2,maxcpus=2 \ ++-numa cpu,node-id=0,socket-id=0 \ ++-numa cpu,node-id=0,socket-id=1 ++@end example ++ + @var{source} and @var{destination} are NUMA node IDs. + @var{distance} is the NUMA distance from @var{source} to @var{destination}. + The distance from a node to itself is always 10. If any pair of nodes is +-- +1.8.3.1 + diff --git a/kvm-numa-Extend-CLI-to-provide-memory-latency-and-bandwi.patch b/kvm-numa-Extend-CLI-to-provide-memory-latency-and-bandwi.patch new file mode 100755 index 0000000..306abeb --- /dev/null +++ b/kvm-numa-Extend-CLI-to-provide-memory-latency-and-bandwi.patch @@ -0,0 +1,545 @@ +From 32341d8cf680625def040b44d70b197f2399bbdb Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:48 +0100 +Subject: [PATCH 05/12] numa: Extend CLI to provide memory latency and + bandwidth information + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-5-plai@redhat.com> +Patchwork-id: 96731 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 04/11] numa: Extend CLI to provide memory latency and bandwidth information +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +Add -numa hmat-lb option to provide System Locality Latency and +Bandwidth Information. These memory attributes help to build +System Locality Latency and Bandwidth Information Structure(s) +in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using +hmat-lb option, enable HMAT with -machine hmat=on. 
+ +Acked-by: Markus Armbruster +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-3-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Igor Mammedov +(cherry picked from commit 9b12dfa03a94d7f7a4b54eb67229a31e58193384) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/core/numa.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++ + include/sysemu/numa.h | 53 ++++++++++++++ + qapi/machine.json | 93 +++++++++++++++++++++++- + qemu-options.hx | 47 +++++++++++- + 4 files changed, 384 insertions(+), 3 deletions(-) + +diff --git a/hw/core/numa.c b/hw/core/numa.c +index a07eef9..58fe713 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -23,6 +23,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/units.h" + #include "sysemu/hostmem.h" + #include "sysemu/numa.h" + #include "sysemu/sysemu.h" +@@ -194,6 +195,186 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) + ms->numa_state->have_numa_distance = true; + } + ++void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, ++ Error **errp) ++{ ++ int i, first_bit, last_bit; ++ uint64_t max_entry, temp_base, bitmap_copy; ++ NodeInfo *numa_info = numa_state->nodes; ++ HMAT_LB_Info *hmat_lb = ++ numa_state->hmat_lb[node->hierarchy][node->data_type]; ++ HMAT_LB_Data lb_data = {}; ++ HMAT_LB_Data *lb_temp; ++ ++ /* Error checking */ ++ if (node->initiator > numa_state->num_nodes) { ++ error_setg(errp, "Invalid initiator=%d, it should be less than %d", ++ node->initiator, numa_state->num_nodes); ++ return; ++ } ++ if (node->target > numa_state->num_nodes) { ++ error_setg(errp, "Invalid target=%d, it should be less than %d", ++ node->target, numa_state->num_nodes); ++ return; ++ } ++ if (!numa_info[node->initiator].has_cpu) { ++ error_setg(errp, "Invalid initiator=%d, it isn't an " ++ "initiator proximity domain", node->initiator); ++ return; ++ } ++ if 
(!numa_info[node->target].present) { ++ error_setg(errp, "The target=%d should point to an existing node", ++ node->target); ++ return; ++ } ++ ++ if (!hmat_lb) { ++ hmat_lb = g_malloc0(sizeof(*hmat_lb)); ++ numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb; ++ hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data)); ++ } ++ hmat_lb->hierarchy = node->hierarchy; ++ hmat_lb->data_type = node->data_type; ++ lb_data.initiator = node->initiator; ++ lb_data.target = node->target; ++ ++ if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) { ++ /* Input latency data */ ++ ++ if (!node->has_latency) { ++ error_setg(errp, "Missing 'latency' option"); ++ return; ++ } ++ if (node->has_bandwidth) { ++ error_setg(errp, "Invalid option 'bandwidth' since " ++ "the data type is latency"); ++ return; ++ } ++ ++ /* Detect duplicate configuration */ ++ for (i = 0; i < hmat_lb->list->len; i++) { ++ lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); ++ ++ if (node->initiator == lb_temp->initiator && ++ node->target == lb_temp->target) { ++ error_setg(errp, "Duplicate configuration of the latency for " ++ "initiator=%d and target=%d", node->initiator, ++ node->target); ++ return; ++ } ++ } ++ ++ hmat_lb->base = hmat_lb->base ? 
hmat_lb->base : UINT64_MAX; ++ ++ if (node->latency) { ++ /* Calculate the temporary base and compressed latency */ ++ max_entry = node->latency; ++ temp_base = 1; ++ while (QEMU_IS_ALIGNED(max_entry, 10)) { ++ max_entry /= 10; ++ temp_base *= 10; ++ } ++ ++ /* Calculate the max compressed latency */ ++ temp_base = MIN(hmat_lb->base, temp_base); ++ max_entry = node->latency / hmat_lb->base; ++ max_entry = MAX(hmat_lb->range_bitmap, max_entry); ++ ++ /* ++ * For latency hmat_lb->range_bitmap record the max compressed ++ * latency which should be less than 0xFFFF (UINT16_MAX) ++ */ ++ if (max_entry >= UINT16_MAX) { ++ error_setg(errp, "Latency %" PRIu64 " between initiator=%d and " ++ "target=%d should not differ from previously entered " ++ "min or max values on more than %d", node->latency, ++ node->initiator, node->target, UINT16_MAX - 1); ++ return; ++ } else { ++ hmat_lb->base = temp_base; ++ hmat_lb->range_bitmap = max_entry; ++ } ++ ++ /* ++ * Set lb_info_provided bit 0 as 1, ++ * latency information is provided ++ */ ++ numa_info[node->target].lb_info_provided |= BIT(0); ++ } ++ lb_data.data = node->latency; ++ } else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) { ++ /* Input bandwidth data */ ++ if (!node->has_bandwidth) { ++ error_setg(errp, "Missing 'bandwidth' option"); ++ return; ++ } ++ if (node->has_latency) { ++ error_setg(errp, "Invalid option 'latency' since " ++ "the data type is bandwidth"); ++ return; ++ } ++ if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) { ++ error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and " ++ "target=%d should be 1MB aligned", node->bandwidth, ++ node->initiator, node->target); ++ return; ++ } ++ ++ /* Detect duplicate configuration */ ++ for (i = 0; i < hmat_lb->list->len; i++) { ++ lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); ++ ++ if (node->initiator == lb_temp->initiator && ++ node->target == lb_temp->target) { ++ error_setg(errp, "Duplicate configuration of the bandwidth for " ++ 
"initiator=%d and target=%d", node->initiator, ++ node->target); ++ return; ++ } ++ } ++ ++ hmat_lb->base = hmat_lb->base ? hmat_lb->base : 1; ++ ++ if (node->bandwidth) { ++ /* Keep bitmap unchanged when bandwidth out of range */ ++ bitmap_copy = hmat_lb->range_bitmap; ++ bitmap_copy |= node->bandwidth; ++ first_bit = ctz64(bitmap_copy); ++ temp_base = UINT64_C(1) << first_bit; ++ max_entry = node->bandwidth / temp_base; ++ last_bit = 64 - clz64(bitmap_copy); ++ ++ /* ++ * For bandwidth, first_bit record the base unit of bandwidth bits, ++ * last_bit record the last bit of the max bandwidth. The max ++ * compressed bandwidth should be less than 0xFFFF (UINT16_MAX) ++ */ ++ if ((last_bit - first_bit) > UINT16_BITS || ++ max_entry >= UINT16_MAX) { ++ error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d " ++ "and target=%d should not differ from previously " ++ "entered values on more than %d", node->bandwidth, ++ node->initiator, node->target, UINT16_MAX - 1); ++ return; ++ } else { ++ hmat_lb->base = temp_base; ++ hmat_lb->range_bitmap = bitmap_copy; ++ } ++ ++ /* ++ * Set lb_info_provided bit 1 as 1, ++ * bandwidth information is provided ++ */ ++ numa_info[node->target].lb_info_provided |= BIT(1); ++ } ++ lb_data.data = node->bandwidth; ++ } else { ++ assert(0); ++ } ++ ++ g_array_append_val(hmat_lb->list, lb_data); ++} ++ + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + { + Error *err = NULL; +@@ -231,6 +412,19 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu), + &err); + break; ++ case NUMA_OPTIONS_TYPE_HMAT_LB: ++ if (!ms->numa_state->hmat_enabled) { ++ error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " ++ "(HMAT) is disabled, enable it with -machine hmat=on " ++ "before using any of hmat specific options"); ++ return; ++ } ++ ++ parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err); ++ if (err) { ++ goto 
end; ++ } ++ break; + default: + abort(); + } +diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h +index 788cbec..70f93c8 100644 +--- a/include/sysemu/numa.h ++++ b/include/sysemu/numa.h +@@ -14,11 +14,34 @@ struct CPUArchId; + #define NUMA_DISTANCE_MAX 254 + #define NUMA_DISTANCE_UNREACHABLE 255 + ++/* the value of AcpiHmatLBInfo flags */ ++enum { ++ HMAT_LB_MEM_MEMORY = 0, ++ HMAT_LB_MEM_CACHE_1ST_LEVEL = 1, ++ HMAT_LB_MEM_CACHE_2ND_LEVEL = 2, ++ HMAT_LB_MEM_CACHE_3RD_LEVEL = 3, ++ HMAT_LB_LEVELS /* must be the last entry */ ++}; ++ ++/* the value of AcpiHmatLBInfo data type */ ++enum { ++ HMAT_LB_DATA_ACCESS_LATENCY = 0, ++ HMAT_LB_DATA_READ_LATENCY = 1, ++ HMAT_LB_DATA_WRITE_LATENCY = 2, ++ HMAT_LB_DATA_ACCESS_BANDWIDTH = 3, ++ HMAT_LB_DATA_READ_BANDWIDTH = 4, ++ HMAT_LB_DATA_WRITE_BANDWIDTH = 5, ++ HMAT_LB_TYPES /* must be the last entry */ ++}; ++ ++#define UINT16_BITS 16 ++ + struct NodeInfo { + uint64_t node_mem; + struct HostMemoryBackend *node_memdev; + bool present; + bool has_cpu; ++ uint8_t lb_info_provided; + uint16_t initiator; + uint8_t distance[MAX_NODES]; + }; +@@ -28,6 +51,31 @@ struct NumaNodeMem { + uint64_t node_plugged_mem; + }; + ++struct HMAT_LB_Data { ++ uint8_t initiator; ++ uint8_t target; ++ uint64_t data; ++}; ++typedef struct HMAT_LB_Data HMAT_LB_Data; ++ ++struct HMAT_LB_Info { ++ /* Indicates it's memory or the specified level memory side cache. */ ++ uint8_t hierarchy; ++ ++ /* Present the type of data, access/read/write latency or bandwidth. 
*/ ++ uint8_t data_type; ++ ++ /* The range bitmap of bandwidth for calculating common base */ ++ uint64_t range_bitmap; ++ ++ /* The common base unit for latencies or bandwidths */ ++ uint64_t base; ++ ++ /* Array to store the latencies or bandwidths */ ++ GArray *list; ++}; ++typedef struct HMAT_LB_Info HMAT_LB_Info; ++ + struct NumaState { + /* Number of NUMA nodes */ + int num_nodes; +@@ -40,11 +88,16 @@ struct NumaState { + + /* NUMA nodes information */ + NodeInfo nodes[MAX_NODES]; ++ ++ /* NUMA nodes HMAT Locality Latency and Bandwidth Information */ ++ HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES]; + }; + typedef struct NumaState NumaState; + + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp); + void parse_numa_opts(MachineState *ms); ++void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, ++ Error **errp); + void numa_complete_configuration(MachineState *ms); + void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms); + extern QemuOptsList qemu_numa_opts; +diff --git a/qapi/machine.json b/qapi/machine.json +index 27d0e37..cf8faf5 100644 +--- a/qapi/machine.json ++++ b/qapi/machine.json +@@ -426,10 +426,12 @@ + # + # @cpu: property based CPU(s) to node mapping (Since: 2.10) + # ++# @hmat-lb: memory latency and bandwidth information (Since: 5.0) ++# + # Since: 2.1 + ## + { 'enum': 'NumaOptionsType', +- 'data': [ 'node', 'dist', 'cpu' ] } ++ 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] } + + ## + # @NumaOptions: +@@ -444,7 +446,8 @@ + 'data': { + 'node': 'NumaNodeOptions', + 'dist': 'NumaDistOptions', +- 'cpu': 'NumaCpuOptions' }} ++ 'cpu': 'NumaCpuOptions', ++ 'hmat-lb': 'NumaHmatLBOptions' }} + + ## + # @NumaNodeOptions: +@@ -558,6 +561,92 @@ + 'data' : {} } + + ## ++# @HmatLBMemoryHierarchy: ++# ++# The memory hierarchy in the System Locality Latency and Bandwidth ++# Information Structure of HMAT (Heterogeneous Memory Attribute Table) ++# ++# For more information about @HmatLBMemoryHierarchy, 
see chapter ++# 5.2.27.4: Table 5-146: Field "Flags" of ACPI 6.3 spec. ++# ++# @memory: the structure represents the memory performance ++# ++# @first-level: first level of memory side cache ++# ++# @second-level: second level of memory side cache ++# ++# @third-level: third level of memory side cache ++# ++# Since: 5.0 ++## ++{ 'enum': 'HmatLBMemoryHierarchy', ++ 'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] } ++ ++## ++# @HmatLBDataType: ++# ++# Data type in the System Locality Latency and Bandwidth ++# Information Structure of HMAT (Heterogeneous Memory Attribute Table) ++# ++# For more information about @HmatLBDataType, see chapter ++# 5.2.27.4: Table 5-146: Field "Data Type" of ACPI 6.3 spec. ++# ++# @access-latency: access latency (nanoseconds) ++# ++# @read-latency: read latency (nanoseconds) ++# ++# @write-latency: write latency (nanoseconds) ++# ++# @access-bandwidth: access bandwidth (Bytes per second) ++# ++# @read-bandwidth: read bandwidth (Bytes per second) ++# ++# @write-bandwidth: write bandwidth (Bytes per second) ++# ++# Since: 5.0 ++## ++{ 'enum': 'HmatLBDataType', ++ 'data': [ 'access-latency', 'read-latency', 'write-latency', ++ 'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] } ++ ++## ++# @NumaHmatLBOptions: ++# ++# Set the system locality latency and bandwidth information ++# between Initiator and Target proximity Domains. ++# ++# For more information about @NumaHmatLBOptions, see chapter ++# 5.2.27.4: Table 5-146 of ACPI 6.3 spec. ++# ++# @initiator: the Initiator Proximity Domain. ++# ++# @target: the Target Proximity Domain. ++# ++# @hierarchy: the Memory Hierarchy. Indicates the performance ++# of memory or side cache. ++# ++# @data-type: presents the type of data, access/read/write ++# latency or hit latency. ++# ++# @latency: the value of latency from @initiator to @target ++# proximity domain, the latency unit is "ns(nanosecond)". 
++# ++# @bandwidth: the value of bandwidth between @initiator and @target ++# proximity domain, the bandwidth unit is ++# "Bytes per second". ++# ++# Since: 5.0 ++## ++{ 'struct': 'NumaHmatLBOptions', ++ 'data': { ++ 'initiator': 'uint16', ++ 'target': 'uint16', ++ 'hierarchy': 'HmatLBMemoryHierarchy', ++ 'data-type': 'HmatLBDataType', ++ '*latency': 'uint64', ++ '*bandwidth': 'size' }} ++ ++## + # @HostMemPolicy: + # + # Host memory policy types +diff --git a/qemu-options.hx b/qemu-options.hx +index e2ce754..86d9d8a 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -168,16 +168,19 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa, + "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa dist,src=source,dst=destination,val=distance\n" +- "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", ++ "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n" ++ "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n", + QEMU_ARCH_ALL) + STEXI + @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] + @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] + @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} + @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] ++@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{hierarchy},data-type=@var{tpye}[,latency=@var{lat}][,bandwidth=@var{bw}] + @findex -numa + Define a NUMA node and assign RAM and VCPUs to it. + Set the NUMA distance from a source node to a destination node. ++Set the ACPI Heterogeneous Memory Attributes for the given nodes. 
+ + Legacy VCPU assignment uses @samp{cpus} option where + @var{firstcpu} and @var{lastcpu} are CPU indexes. Each +@@ -256,6 +259,48 @@ specified resources, it just assigns existing resources to NUMA + nodes. This means that one still has to use the @option{-m}, + @option{-smp} options to allocate RAM and VCPUs respectively. + ++Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information ++between initiator and target NUMA nodes in ACPI Heterogeneous Memory Attribute Table (HMAT). ++Initiator NUMA node can create memory requests, usually it has one or more processors. ++Target NUMA node contains addressable memory. ++ ++In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{hierarchy} is the memory ++hierarchy of the target NUMA node: if @var{hierarchy} is 'memory', the structure ++represents the memory performance; if @var{hierarchy} is 'first-level|second-level|third-level', ++this structure represents aggregated performance of memory side caches for each domain. ++@var{type} of 'data-type' is type of data represented by this structure instance: ++if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency or 'access|read|write' ++bandwidth of the target memory; if 'hierarchy' is 'first-level|second-level|third-level', ++'data-type' is 'access|read|write' hit latency or 'access|read|write' hit bandwidth of the ++target memory side cache. ++ ++@var{lat} is latency value in nanoseconds. @var{bw} is bandwidth value, ++the possible value and units are NUM[M|G|T], mean that the bandwidth value are ++NUM byte per second (or MB/s, GB/s or TB/s depending on used suffix). ++Note that if latency or bandwidth value is 0, means the corresponding latency or ++bandwidth information is not provided. ++ ++For example, the following options describe 2 NUMA nodes. Node 0 has 2 cpus and ++a ram, node 1 has only a ram.
The processors in node 0 access memory in node ++0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s; ++The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10 ++nanoseconds, access-bandwidth is 100 MB/s. ++@example ++-machine hmat=on \ ++-m 2G \ ++-object memory-backend-ram,size=1G,id=m0 \ ++-object memory-backend-ram,size=1G,id=m1 \ ++-smp 2 \ ++-numa node,nodeid=0,memdev=m0 \ ++-numa node,nodeid=1,memdev=m1,initiator=0 \ ++-numa cpu,node-id=0,socket-id=0 \ ++-numa cpu,node-id=0,socket-id=1 \ ++-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5 \ ++-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \ ++-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10 \ ++-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M ++@end example ++ + ETEXI + + DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd, +-- +1.8.3.1 + diff --git a/kvm-numa-Extend-CLI-to-provide-memory-side-cache-informa.patch b/kvm-numa-Extend-CLI-to-provide-memory-side-cache-informa.patch new file mode 100755 index 0000000..a17db22 --- /dev/null +++ b/kvm-numa-Extend-CLI-to-provide-memory-side-cache-informa.patch @@ -0,0 +1,326 @@ +From 8cd3544b1347b248b9d04eb3d6c9b9bde3a13655 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:49 +0100 +Subject: [PATCH 06/12] numa: Extend CLI to provide memory side cache + information + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-6-plai@redhat.com> +Patchwork-id: 96740 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 05/11] numa: Extend CLI to provide memory side cache information +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +Add -numa hmat-cache option to provide Memory Side Cache Information. 
+These memory attributes help to build Memory Side Cache Information +Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). +Before using hmat-cache option, enable HMAT with -machine hmat=on. + +Acked-by: Markus Armbruster +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-4-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Igor Mammedov +(cherry picked from commit c412a48d4d91e8f8b89aae02de0f44f1f0b729e5) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/core/numa.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ + include/sysemu/numa.h | 5 ++++ + qapi/machine.json | 81 +++++++++++++++++++++++++++++++++++++++++++++++++-- + qemu-options.hx | 17 +++++++++-- + 4 files changed, 179 insertions(+), 4 deletions(-) + +diff --git a/hw/core/numa.c b/hw/core/numa.c +index 58fe713..0d1b4be 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -375,6 +375,73 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, + g_array_append_val(hmat_lb->list, lb_data); + } + ++void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, ++ Error **errp) ++{ ++ int nb_numa_nodes = ms->numa_state->num_nodes; ++ NodeInfo *numa_info = ms->numa_state->nodes; ++ NumaHmatCacheOptions *hmat_cache = NULL; ++ ++ if (node->node_id >= nb_numa_nodes) { ++ error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " ++ "than %d", node->node_id, nb_numa_nodes); ++ return; ++ } ++ ++ if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { ++ error_setg(errp, "The latency and bandwidth information of " ++ "node-id=%" PRIu32 " should be provided before memory side " ++ "cache attributes", node->node_id); ++ return; ++ } ++ ++ if (node->level < 1 || node->level >= HMAT_LB_LEVELS) { ++ error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 " ++ "and less than or equal to %d", node->level, ++ HMAT_LB_LEVELS - 1); ++ 
return; ++ } ++ ++ assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX); ++ assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); ++ if (ms->numa_state->hmat_cache[node->node_id][node->level]) { ++ error_setg(errp, "Duplicate configuration of the side cache for " ++ "node-id=%" PRIu32 " and level=%" PRIu8, ++ node->node_id, node->level); ++ return; ++ } ++ ++ if ((node->level > 1) && ++ ms->numa_state->hmat_cache[node->node_id][node->level - 1] && ++ (node->size >= ++ ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { ++ error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 ++ " should be less than the size(%" PRIu64 ") of " ++ "level=%u", node->size, node->level, ++ ms->numa_state->hmat_cache[node->node_id] ++ [node->level - 1]->size, ++ node->level - 1); ++ return; ++ } ++ ++ if ((node->level < HMAT_LB_LEVELS - 1) && ++ ms->numa_state->hmat_cache[node->node_id][node->level + 1] && ++ (node->size <= ++ ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) { ++ error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 ++ " should be larger than the size(%" PRIu64 ") of " ++ "level=%u", node->size, node->level, ++ ms->numa_state->hmat_cache[node->node_id] ++ [node->level + 1]->size, ++ node->level + 1); ++ return; ++ } ++ ++ hmat_cache = g_malloc0(sizeof(*hmat_cache)); ++ memcpy(hmat_cache, node, sizeof(*hmat_cache)); ++ ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache; ++} ++ + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + { + Error *err = NULL; +@@ -425,6 +492,19 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + goto end; + } + break; ++ case NUMA_OPTIONS_TYPE_HMAT_CACHE: ++ if (!ms->numa_state->hmat_enabled) { ++ error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " ++ "(HMAT) is disabled, enable it with -machine hmat=on " ++ "before using any of hmat specific options"); ++ return; ++ } ++ ++ 
parse_numa_hmat_cache(ms, &object->u.hmat_cache, &err); ++ if (err) { ++ goto end; ++ } ++ break; + default: + abort(); + } +diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h +index 70f93c8..ba693cc 100644 +--- a/include/sysemu/numa.h ++++ b/include/sysemu/numa.h +@@ -91,6 +91,9 @@ struct NumaState { + + /* NUMA nodes HMAT Locality Latency and Bandwidth Information */ + HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES]; ++ ++ /* Memory Side Cache Information Structure */ ++ NumaHmatCacheOptions *hmat_cache[MAX_NODES][HMAT_LB_LEVELS]; + }; + typedef struct NumaState NumaState; + +@@ -98,6 +101,8 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp); + void parse_numa_opts(MachineState *ms); + void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, + Error **errp); ++void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, ++ Error **errp); + void numa_complete_configuration(MachineState *ms); + void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms); + extern QemuOptsList qemu_numa_opts; +diff --git a/qapi/machine.json b/qapi/machine.json +index cf8faf5..b3d30bc 100644 +--- a/qapi/machine.json ++++ b/qapi/machine.json +@@ -428,10 +428,12 @@ + # + # @hmat-lb: memory latency and bandwidth information (Since: 5.0) + # ++# @hmat-cache: memory side cache information (Since: 5.0) ++# + # Since: 2.1 + ## + { 'enum': 'NumaOptionsType', +- 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] } ++ 'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] } + + ## + # @NumaOptions: +@@ -447,7 +449,8 @@ + 'node': 'NumaNodeOptions', + 'dist': 'NumaDistOptions', + 'cpu': 'NumaCpuOptions', +- 'hmat-lb': 'NumaHmatLBOptions' }} ++ 'hmat-lb': 'NumaHmatLBOptions', ++ 'hmat-cache': 'NumaHmatCacheOptions' }} + + ## + # @NumaNodeOptions: +@@ -647,6 +650,80 @@ + '*bandwidth': 'size' }} + + ## ++# @HmatCacheAssociativity: ++# ++# Cache associativity in the Memory Side Cache Information Structure ++# of HMAT ++# ++# For 
more information of @HmatCacheAssociativity, see chapter ++# 5.2.27.5: Table 5-147 of ACPI 6.3 spec. ++# ++# @none: None (no memory side cache in this proximity domain, ++# or cache associativity unknown) ++# ++# @direct: Direct Mapped ++# ++# @complex: Complex Cache Indexing (implementation specific) ++# ++# Since: 5.0 ++## ++{ 'enum': 'HmatCacheAssociativity', ++ 'data': [ 'none', 'direct', 'complex' ] } ++ ++## ++# @HmatCacheWritePolicy: ++# ++# Cache write policy in the Memory Side Cache Information Structure ++# of HMAT ++# ++# For more information of @HmatCacheWritePolicy, see chapter ++# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. ++# ++# @none: None (no memory side cache in this proximity domain, ++# or cache write policy unknown) ++# ++# @write-back: Write Back (WB) ++# ++# @write-through: Write Through (WT) ++# ++# Since: 5.0 ++## ++{ 'enum': 'HmatCacheWritePolicy', ++ 'data': [ 'none', 'write-back', 'write-through' ] } ++ ++## ++# @NumaHmatCacheOptions: ++# ++# Set the memory side cache information for a given memory domain. ++# ++# For more information of @NumaHmatCacheOptions, see chapter ++# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. ++# ++# @node-id: the memory proximity domain to which the memory belongs. ++# ++# @size: the size of memory side cache in bytes. ++# ++# @level: the cache level described in this structure. ++# ++# @associativity: the cache associativity, ++# none/direct-mapped/complex(complex cache indexing). ++# ++# @policy: the write policy, none/write-back/write-through. ++# ++# @line: the cache Line size in bytes. 
++# ++# Since: 5.0 ++## ++{ 'struct': 'NumaHmatCacheOptions', ++ 'data': { ++ 'node-id': 'uint32', ++ 'size': 'size', ++ 'level': 'uint8', ++ 'associativity': 'HmatCacheAssociativity', ++ 'policy': 'HmatCacheWritePolicy', ++ 'line': 'uint16' }} ++ ++## + # @HostMemPolicy: + # + # Host memory policy types +diff --git a/qemu-options.hx b/qemu-options.hx +index 86d9d8a..8fe05b6 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -169,7 +169,8 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa, + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa dist,src=source,dst=destination,val=distance\n" + "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n" +- "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n", ++ "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n" ++ "-numa hmat-cache,node-id=node,size=size,level=level[,associativity=none|direct|complex][,policy=none|write-back|write-through][,line=size]\n", + QEMU_ARCH_ALL) + STEXI + @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] +@@ -177,6 +178,7 @@ STEXI + @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} + @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] + @itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{hierarchy},data-type=@var{tpye}[,latency=@var{lat}][,bandwidth=@var{bw}] ++@itemx -numa hmat-cache,node-id=@var{node},size=@var{size},level=@var{level}[,associativity=@var{str}][,policy=@var{str}][,line=@var{size}] + @findex -numa + Define a NUMA node and assign RAM and VCPUs to it. + Set the NUMA distance from a source node to a destination node. 
+@@ -280,11 +282,20 @@ NUM byte per second (or MB/s, GB/s or TB/s depending on used suffix). + Note that if latency or bandwidth value is 0, means the corresponding latency or + bandwidth information is not provided. + ++In @samp{hmat-cache} option, @var{node-id} is the NUMA id that the memory belongs to. ++@var{size} is the size of memory side cache in bytes. @var{level} is the cache ++level described in this structure, note that the cache level 0 should not be used ++with @samp{hmat-cache} option. @var{associativity} is the cache associativity, ++the possible values are 'none/direct(direct-mapped)/complex(complex cache indexing)'. ++@var{policy} is the write policy. @var{line} is the cache Line size in bytes. ++ + For example, the following options describe 2 NUMA nodes. Node 0 has 2 cpus and + a ram, node 1 has only a ram. The processors in node 0 access memory in node + 0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s; + The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10 + nanoseconds, access-bandwidth is 100 MB/s. ++And for memory side cache information, NUMA node 0 and 1 both have 1 level memory ++cache, size is 10KB, policy is write-back, the cache Line size is 8 bytes: + @example + -machine hmat=on \ + -m 2G \ +@@ -298,7 +309,9 @@ nanoseconds, access-bandwidth is 100 MB/s.
+ -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5 \ + -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \ + -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10 \ +--numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M ++-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M \ ++-numa hmat-cache,node-id=0,size=10K,level=1,associativity=direct,policy=write-back,line=8 \ ++-numa hmat-cache,node-id=1,size=10K,level=1,associativity=direct,policy=write-back,line=8 + @end example + + ETEXI +-- +1.8.3.1 + diff --git a/kvm-numa-properly-check-if-numa-is-supported.patch b/kvm-numa-properly-check-if-numa-is-supported.patch new file mode 100755 index 0000000..c602256 --- /dev/null +++ b/kvm-numa-properly-check-if-numa-is-supported.patch @@ -0,0 +1,81 @@ +From e3a1c2ff0d7b930b1782d59d093fd15471d3aee1 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:46 +0100 +Subject: [PATCH 03/12] numa: properly check if numa is supported + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-3-plai@redhat.com> +Patchwork-id: 96732 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 02/11] numa: properly check if numa is supported +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Igor Mammedov + +Commit aa57020774b, by mistake used MachineClass::numa_mem_supported +to check if NUMA is supported by machine and also as unrelated change +set it to true for sbsa-ref board. + +Luckily change didn't break machines that support NUMA, as the field +is set to true for them. + +But the field is not intended for checking if NUMA is supported and +will be flipped to false within this release for new machine types. 
+ +Fix it: + - by using previously used condition + !mc->cpu_index_to_instance_props || !mc->get_default_cpu_node_id + the first time and then use MachineState::numa_state down the road + to check if NUMA is supported + - dropping stray sbsa-ref chunk + +Fixes: aa57020774b690a22be72453b8e91c9b5a68c516 +Signed-off-by: Igor Mammedov +Message-Id: <1576154936-178362-3-git-send-email-imammedo@redhat.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit fcd3f2cc124600385dba46c69a80626985c15b50) +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/sbsa-ref.c | 1 - + hw/core/machine.c | 4 ++-- + 2 files changed, 2 insertions(+), 3 deletions(-) + +diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c +index 27046cc..c6261d4 100644 +--- a/hw/arm/sbsa-ref.c ++++ b/hw/arm/sbsa-ref.c +@@ -791,7 +791,6 @@ static void sbsa_ref_class_init(ObjectClass *oc, void *data) + mc->possible_cpu_arch_ids = sbsa_ref_possible_cpu_arch_ids; + mc->cpu_index_to_instance_props = sbsa_ref_cpu_index_to_props; + mc->get_default_cpu_node_id = sbsa_ref_get_default_cpu_node_id; +- mc->numa_mem_supported = true; + } + + static const TypeInfo sbsa_ref_info = { +diff --git a/hw/core/machine.c b/hw/core/machine.c +index 5a025d1..19c78c6 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -1128,7 +1128,7 @@ static void machine_initfn(Object *obj) + NULL); + } + +- if (mc->numa_mem_supported) { ++ if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) { + ms->numa_state = g_new0(NumaState, 1); + } + +@@ -1272,7 +1272,7 @@ void machine_run_board_init(MachineState *machine) + { + MachineClass *machine_class = MACHINE_GET_CLASS(machine); + +- if (machine_class->numa_mem_supported) { ++ if (machine->numa_state) { + numa_complete_configuration(machine); + if (machine->numa_state->num_nodes) { + machine_numa_finish_cpu_init(machine); +-- +1.8.3.1 + diff --git a/kvm-numa-remove-not-needed-check.patch b/kvm-numa-remove-not-needed-check.patch new file mode 100755 index 0000000..cbe677f 
--- /dev/null +++ b/kvm-numa-remove-not-needed-check.patch @@ -0,0 +1,59 @@ +From 348115bbd0d60fada6f7d9fa27848044690a4bc3 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:45 +0100 +Subject: [PATCH 02/12] numa: remove not needed check + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-2-plai@redhat.com> +Patchwork-id: 96738 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 01/11] numa: remove not needed check +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Igor Mammedov + +Currently parse_numa_node() is always called from already numa +enabled context. +Drop unnecessary check if numa is supported. + +Signed-off-by: Igor Mammedov +Message-Id: <1576154936-178362-2-git-send-email-imammedo@redhat.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 5275db59aa7ff8a26bd6aa5d07cb4d53de5cfab5) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/core/numa.c | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +diff --git a/hw/core/numa.c b/hw/core/numa.c +index e3332a9..19f082d 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -83,10 +83,6 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, + return; + } + +- if (!mc->cpu_index_to_instance_props || !mc->get_default_cpu_node_id) { +- error_setg(errp, "NUMA is not supported by this machine-type"); +- return; +- } + for (cpus = node->cpus; cpus; cpus = cpus->next) { + CpuInstanceProperties props; + if (cpus->value >= max_cpus) { +@@ -178,9 +174,8 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + { + Error *err = NULL; +- MachineClass *mc = MACHINE_GET_CLASS(ms); + +- if (!mc->numa_mem_supported) { ++ if (!ms->numa_state) { + error_setg(errp, "NUMA is not supported by this machine-type"); + goto end; + } +-- +1.8.3.1 + diff --git 
a/kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch b/kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch new file mode 100755 index 0000000..008874f --- /dev/null +++ b/kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch @@ -0,0 +1,250 @@ +From aac48d07764ce73c2ba23e3f05ccd29db190024a Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Thu, 8 Oct 2020 11:06:43 -0400 +Subject: [PATCH 04/14] nvram: Exit QEMU if NVRAM cannot contain all -prom-env + data + +RH-Author: Greg Kurz +Message-id: <20201008110643.155902-2-gkurz@redhat.com> +Patchwork-id: 98577 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] nvram: Exit QEMU if NVRAM cannot contain all -prom-env data +Bugzilla: 1874780 +RH-Acked-by: David Gibson +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Greg Kurz + +Since commit 61f20b9dc5b7 ("spapr_nvram: Pre-initialize the NVRAM to +support the -prom-env parameter"), pseries machines can pre-initialize +the "system" partition in the NVRAM with the data passed to all -prom-env +parameters on the QEMU command line. + +In this case it is assumed that all the data fits in 64 KiB, but the user +can easily pass more and crash QEMU: + +$ qemu-system-ppc64 -M pseries $(for ((x=0;x<128;x++)); do \ + echo -n " -prom-env " ; printf "%0.sx" {1..1024}; \ + done) # this requires ~128 KiB +malloc(): corrupted top size +Aborted (core dumped) + +This happens because we don't check if all the prom-env data fits in +the NVRAM and chrp_nvram_set_var() happily memcpy() it past the +buffer. + +This crash affects basically all ppc/ppc64 machine types that use -prom-env: +- pseries (all versions) +- g3beige +- mac99 + +and also sparc/sparc64 machine types: +- LX +- SPARCClassic +- SPARCbook +- SS-10 +- SS-20 +- SS-4 +- SS-5 +- SS-600MP +- Voyager +- sun4u +- sun4v + +Add a max_len argument to chrp_nvram_create_system_partition() so that +it can check the available size before writing to memory.
+ +Since NVRAM is populated at machine init, it seems reasonable to consider +this error as fatal. So, instead of reporting an error when we detect that +the NVRAM is too small and adapt all machine types to handle it, we simply +exit QEMU in all cases. This is still better than crashing. If someone +wants another behavior, I guess this can be reworked later. + +Tested with: + +$ yes q | \ + (for arch in ppc ppc64 sparc sparc64; do \ + echo == $arch ==; \ + qemu=${arch}-softmmu/qemu-system-$arch; \ + for mach in $($qemu -M help | awk '! /^Supported/ { print $1 }'); do \ + echo $mach; \ + $qemu -M $mach -monitor stdio -nodefaults -nographic \ + $(for ((x=0;x<128;x++)); do \ + echo -n " -prom-env " ; printf "%0.sx" {1..1024}; \ + done) >/dev/null; \ + done; echo; \ + done) + +Without the patch, affected machine types cause QEMU to report some +memory corruption and crash: + +malloc(): corrupted top size + +free(): invalid size + +*** stack smashing detected ***: terminated + +With the patch, QEMU prints the following message and exits: + +NVRAM is too small. Try to pass less data to -prom-env + +It seems that the conditions for the crash have always existed, but it +affects pseries, the machine type I care for, since commit 61f20b9dc5b7 +only. + +Fixes: 61f20b9dc5b7 ("spapr_nvram: Pre-initialize the NVRAM to support the -prom-env parameter") +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1867739 +Reported-by: John Snow +Reviewed-by: Laurent Vivier +Signed-off-by: Greg Kurz +Message-Id: <159736033937.350502.12402444542194031035.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 37035df51eaabb8d26b71da75b88a1c6727de8fa) +Signed-off-by: Greg Kurz +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/nvram/chrp_nvram.c | 24 +++++++++++++++++++++--- + hw/nvram/mac_nvram.c | 2 +- + hw/nvram/spapr_nvram.c | 3 ++- + hw/sparc/sun4m.c | 2 +- + hw/sparc64/sun4u.c | 2 +- + include/hw/nvram/chrp_nvram.h | 3 ++- + 6 files changed, 28 insertions(+), 8 deletions(-) + +diff --git a/hw/nvram/chrp_nvram.c b/hw/nvram/chrp_nvram.c +index d969f26704..d4d10a7c03 100644 +--- a/hw/nvram/chrp_nvram.c ++++ b/hw/nvram/chrp_nvram.c +@@ -21,14 +21,21 @@ + + #include "qemu/osdep.h" + #include "qemu/cutils.h" ++#include "qemu/error-report.h" + #include "hw/nvram/chrp_nvram.h" + #include "sysemu/sysemu.h" + +-static int chrp_nvram_set_var(uint8_t *nvram, int addr, const char *str) ++static int chrp_nvram_set_var(uint8_t *nvram, int addr, const char *str, ++ int max_len) + { + int len; + + len = strlen(str) + 1; ++ ++ if (max_len < len) { ++ return -1; ++ } ++ + memcpy(&nvram[addr], str, len); + + return addr + len; +@@ -38,19 +45,26 @@ static int chrp_nvram_set_var(uint8_t *nvram, int addr, const char *str) + * Create a "system partition", used for the Open Firmware + * environment variables. 
+ */ +-int chrp_nvram_create_system_partition(uint8_t *data, int min_len) ++int chrp_nvram_create_system_partition(uint8_t *data, int min_len, int max_len) + { + ChrpNvramPartHdr *part_header; + unsigned int i; + int end; + ++ if (max_len < sizeof(*part_header)) { ++ goto fail; ++ } ++ + part_header = (ChrpNvramPartHdr *)data; + part_header->signature = CHRP_NVPART_SYSTEM; + pstrcpy(part_header->name, sizeof(part_header->name), "system"); + + end = sizeof(ChrpNvramPartHdr); + for (i = 0; i < nb_prom_envs; i++) { +- end = chrp_nvram_set_var(data, end, prom_envs[i]); ++ end = chrp_nvram_set_var(data, end, prom_envs[i], max_len - end); ++ if (end == -1) { ++ goto fail; ++ } + } + + /* End marker */ +@@ -65,6 +79,10 @@ int chrp_nvram_create_system_partition(uint8_t *data, int min_len) + chrp_nvram_finish_partition(part_header, end); + + return end; ++ ++fail: ++ error_report("NVRAM is too small. Try to pass less data to -prom-env"); ++ exit(EXIT_FAILURE); + } + + /** +diff --git a/hw/nvram/mac_nvram.c b/hw/nvram/mac_nvram.c +index 9a47e35b8e..ecfb36182f 100644 +--- a/hw/nvram/mac_nvram.c ++++ b/hw/nvram/mac_nvram.c +@@ -152,7 +152,7 @@ static void pmac_format_nvram_partition_of(MacIONVRAMState *nvr, int off, + + /* OpenBIOS nvram variables partition */ + sysp_end = chrp_nvram_create_system_partition(&nvr->data[off], +- DEF_SYSTEM_SIZE) + off; ++ DEF_SYSTEM_SIZE, len) + off; + + /* Free space partition */ + chrp_nvram_create_free_partition(&nvr->data[sysp_end], len - sysp_end); +diff --git a/hw/nvram/spapr_nvram.c b/hw/nvram/spapr_nvram.c +index 838082b451..225cd69b49 100644 +--- a/hw/nvram/spapr_nvram.c ++++ b/hw/nvram/spapr_nvram.c +@@ -188,7 +188,8 @@ static void spapr_nvram_realize(SpaprVioDevice *dev, Error **errp) + } + } else if (nb_prom_envs > 0) { + /* Create a system partition to pass the -prom-env variables */ +- chrp_nvram_create_system_partition(nvram->buf, MIN_NVRAM_SIZE / 4); ++ chrp_nvram_create_system_partition(nvram->buf, MIN_NVRAM_SIZE / 4, ++ 
nvram->size); + chrp_nvram_create_free_partition(&nvram->buf[MIN_NVRAM_SIZE / 4], + nvram->size - MIN_NVRAM_SIZE / 4); + } +diff --git a/hw/sparc/sun4m.c b/hw/sparc/sun4m.c +index 2aaa5bf1ae..cf2d0762d9 100644 +--- a/hw/sparc/sun4m.c ++++ b/hw/sparc/sun4m.c +@@ -142,7 +142,7 @@ static void nvram_init(Nvram *nvram, uint8_t *macaddr, + memset(image, '\0', sizeof(image)); + + /* OpenBIOS nvram variables partition */ +- sysp_end = chrp_nvram_create_system_partition(image, 0); ++ sysp_end = chrp_nvram_create_system_partition(image, 0, 0x1fd0); + + /* Free space partition */ + chrp_nvram_create_free_partition(&image[sysp_end], 0x1fd0 - sysp_end); +diff --git a/hw/sparc64/sun4u.c b/hw/sparc64/sun4u.c +index 955082773b..f5295a687e 100644 +--- a/hw/sparc64/sun4u.c ++++ b/hw/sparc64/sun4u.c +@@ -137,7 +137,7 @@ static int sun4u_NVRAM_set_params(Nvram *nvram, uint16_t NVRAM_size, + memset(image, '\0', sizeof(image)); + + /* OpenBIOS nvram variables partition */ +- sysp_end = chrp_nvram_create_system_partition(image, 0); ++ sysp_end = chrp_nvram_create_system_partition(image, 0, 0x1fd0); + + /* Free space partition */ + chrp_nvram_create_free_partition(&image[sysp_end], 0x1fd0 - sysp_end); +diff --git a/include/hw/nvram/chrp_nvram.h b/include/hw/nvram/chrp_nvram.h +index 09941a9be4..4a0f5c21b8 100644 +--- a/include/hw/nvram/chrp_nvram.h ++++ b/include/hw/nvram/chrp_nvram.h +@@ -50,7 +50,8 @@ chrp_nvram_finish_partition(ChrpNvramPartHdr *header, uint32_t size) + header->checksum = sum & 0xff; + } + +-int chrp_nvram_create_system_partition(uint8_t *data, int min_len); ++/* chrp_nvram_create_system_partition() failure is fatal */ ++int chrp_nvram_create_system_partition(uint8_t *data, int min_len, int max_len); + int chrp_nvram_create_free_partition(uint8_t *data, int len); + + #endif +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch b/kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch new file mode 100755 index 0000000..270b926 
--- /dev/null +++ b/kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch @@ -0,0 +1,112 @@ +From e46aaac6f1ad67753face896e827ad1da920b9e5 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:47 -0400 +Subject: [PATCH 11/14] pc-bios/s390-ccw: Allow booting in case the first + virtio-blk disk is bad + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-8-thuth@redhat.com> +Patchwork-id: 98601 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 7/9] pc-bios/s390-ccw: Allow booting in case the first virtio-blk disk is bad +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +If you try to boot with two virtio-blk disks (without bootindex), and +only the second one is bootable, the s390-ccw bios currently stops at +the first disk and does not continue booting from the second one. This +is annoying - and all other major QEMU firmwares succeed to boot from +the second disk in this case, so we should do the same in the s390-ccw +bios, too. + +Reviewed-by: Cornelia Huck +Message-Id: <20200806105349.632-8-thuth@redhat.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 5dc739f343cd06ecb9b058294564ce7504856f3f) +Signed-off-by: Danilo C. L. 
de Paula +--- + pc-bios/s390-ccw/bootmap.c | 34 +++++++++++++++++++++++----------- + pc-bios/s390-ccw/main.c | 2 +- + 2 files changed, 24 insertions(+), 12 deletions(-) + +diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c +index d13b7cbd15..e91ea719ff 100644 +--- a/pc-bios/s390-ccw/bootmap.c ++++ b/pc-bios/s390-ccw/bootmap.c +@@ -289,11 +289,18 @@ static void ipl_eckd_cdl(void) + read_block(1, ipl2, "Cannot read IPL2 record at block 1"); + + mbr = &ipl2->mbr; +- IPL_assert(magic_match(mbr, ZIPL_MAGIC), "No zIPL section in IPL2 record."); +- IPL_assert(block_size_ok(mbr->blockptr.xeckd.bptr.size), +- "Bad block size in zIPL section of IPL2 record."); +- IPL_assert(mbr->dev_type == DEV_TYPE_ECKD, +- "Non-ECKD device type in zIPL section of IPL2 record."); ++ if (!magic_match(mbr, ZIPL_MAGIC)) { ++ sclp_print("No zIPL section in IPL2 record.\n"); ++ return; ++ } ++ if (!block_size_ok(mbr->blockptr.xeckd.bptr.size)) { ++ sclp_print("Bad block size in zIPL section of IPL2 record.\n"); ++ return; ++ } ++ if (!mbr->dev_type == DEV_TYPE_ECKD) { ++ sclp_print("Non-ECKD device type in zIPL section of IPL2 record.\n"); ++ return; ++ } + + /* save pointer to Boot Map Table */ + bmt_block_nr = eckd_block_num(&mbr->blockptr.xeckd.bptr.chs); +@@ -303,10 +310,14 @@ static void ipl_eckd_cdl(void) + + memset(sec, FREE_SPACE_FILLER, sizeof(sec)); + read_block(2, vlbl, "Cannot read Volume Label at block 2"); +- IPL_assert(magic_match(vlbl->key, VOL1_MAGIC), +- "Invalid magic of volume label block"); +- IPL_assert(magic_match(vlbl->f.key, VOL1_MAGIC), +- "Invalid magic of volser block"); ++ if (!magic_match(vlbl->key, VOL1_MAGIC)) { ++ sclp_print("Invalid magic of volume label block.\n"); ++ return; ++ } ++ if (!magic_match(vlbl->f.key, VOL1_MAGIC)) { ++ sclp_print("Invalid magic of volser block.\n"); ++ return; ++ } + print_volser(vlbl->f.volser); + + run_eckd_boot_script(bmt_block_nr, s1b_block_nr); +@@ -400,7 +411,8 @@ static void ipl_eckd(void) + read_block(0, 
mbr, "Cannot read block 0 on DASD"); + + if (magic_match(mbr->magic, IPL1_MAGIC)) { +- ipl_eckd_cdl(); /* no return */ ++ ipl_eckd_cdl(); /* only returns in case of error */ ++ return; + } + + /* LDL/CMS? */ +@@ -827,5 +839,5 @@ void zipl_load(void) + panic("\n! Unknown IPL device type !\n"); + } + +- panic("\n* this can never happen *\n"); ++ sclp_print("zIPL load failed.\n"); + } +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index 5c1c98341d..b5c721c395 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -249,7 +249,7 @@ static void ipl_boot_device(void) + break; + case CU_TYPE_VIRTIO: + if (virtio_setup() == 0) { +- zipl_load(); /* no return */ ++ zipl_load(); /* Only returns in case of errors */ + } + break; + default: +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch b/kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch new file mode 100755 index 0000000..4a295ca --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch @@ -0,0 +1,214 @@ +From 6f44767aeda52048e7c9ee4b5fcc30353c71cbc1 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:45 -0400 +Subject: [PATCH 09/14] pc-bios/s390-ccw: Do not bail out early if not finding + a SCSI disk + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-6-thuth@redhat.com> +Patchwork-id: 98599 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 5/9] pc-bios/s390-ccw: Do not bail out early if not finding a SCSI disk +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +In case the user did not specify a boot device, we want to continue +looking for other devices if there are no valid SCSI disks on a virtio- +scsi controller. As a first step, do not panic in this case and let +the control flow carry the error to the upper functions instead. 
+ +Message-Id: <20200806105349.632-6-thuth@redhat.com> +Reviewed-by: Cornelia Huck +Signed-off-by: Thomas Huth +(cherry picked from commit 605751b5a5334e187761b0b8a8266a216897bf70) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/main.c | 14 ++++++++++---- + pc-bios/s390-ccw/s390-ccw.h | 2 +- + pc-bios/s390-ccw/virtio-blkdev.c | 7 +++++-- + pc-bios/s390-ccw/virtio-scsi.c | 28 ++++++++++++++++++++-------- + pc-bios/s390-ccw/virtio-scsi.h | 2 +- + 5 files changed, 37 insertions(+), 16 deletions(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index d6fd218074..456733fbee 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -227,7 +227,7 @@ static void find_boot_device(void) + IPL_assert(found, "Boot device not found\n"); + } + +-static void virtio_setup(void) ++static int virtio_setup(void) + { + VDev *vdev = virtio_get_device(); + QemuIplParameters *early_qipl = (QemuIplParameters *)QIPL_ADDRESS; +@@ -242,9 +242,14 @@ static void virtio_setup(void) + sclp_print("Network boot device detected\n"); + vdev->netboot_start_addr = qipl.netboot_start_addr; + } else { +- virtio_blk_setup_device(blk_schid); ++ int ret = virtio_blk_setup_device(blk_schid); ++ if (ret) { ++ return ret; ++ } + IPL_assert(virtio_ipl_disk_is_valid(), "No valid IPL device detected"); + } ++ ++ return 0; + } + + static void ipl_boot_device(void) +@@ -255,8 +260,9 @@ static void ipl_boot_device(void) + dasd_ipl(blk_schid, cutype); /* no return */ + break; + case CU_TYPE_VIRTIO: +- virtio_setup(); +- zipl_load(); /* no return */ ++ if (virtio_setup() == 0) { ++ zipl_load(); /* no return */ ++ } + break; + default: + print_int("Attempting to boot from unexpected device type", cutype); +diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h +index ae432c40b8..e7cf36eb91 100644 +--- a/pc-bios/s390-ccw/s390-ccw.h ++++ b/pc-bios/s390-ccw/s390-ccw.h +@@ -70,7 +70,7 @@ int sclp_read(char *str, size_t count); + unsigned long 
virtio_load_direct(ulong rec_list1, ulong rec_list2, + ulong subchan_id, void *load_addr); + bool virtio_is_supported(SubChannelId schid); +-void virtio_blk_setup_device(SubChannelId schid); ++int virtio_blk_setup_device(SubChannelId schid); + int virtio_read(ulong sector, void *load_addr); + u64 get_clock(void); + ulong get_second(void); +diff --git a/pc-bios/s390-ccw/virtio-blkdev.c b/pc-bios/s390-ccw/virtio-blkdev.c +index 11c56261ca..7d35050292 100644 +--- a/pc-bios/s390-ccw/virtio-blkdev.c ++++ b/pc-bios/s390-ccw/virtio-blkdev.c +@@ -263,9 +263,10 @@ uint64_t virtio_get_blocks(void) + return 0; + } + +-void virtio_blk_setup_device(SubChannelId schid) ++int virtio_blk_setup_device(SubChannelId schid) + { + VDev *vdev = virtio_get_device(); ++ int ret = 0; + + vdev->schid = schid; + virtio_setup_ccw(vdev); +@@ -288,9 +289,11 @@ void virtio_blk_setup_device(SubChannelId schid) + "Config: CDB size mismatch"); + + sclp_print("Using virtio-scsi.\n"); +- virtio_scsi_setup(vdev); ++ ret = virtio_scsi_setup(vdev); + break; + default: + panic("\n! No IPL device available !\n"); + } ++ ++ return ret; + } +diff --git a/pc-bios/s390-ccw/virtio-scsi.c b/pc-bios/s390-ccw/virtio-scsi.c +index 4fe4b9d261..88691edb89 100644 +--- a/pc-bios/s390-ccw/virtio-scsi.c ++++ b/pc-bios/s390-ccw/virtio-scsi.c +@@ -192,7 +192,12 @@ static bool scsi_read_capacity(VDev *vdev, + + /* virtio-scsi routines */ + +-static void virtio_scsi_locate_device(VDev *vdev) ++/* ++ * Tries to locate a SCSI device and adds the information for the found ++ * device to the vdev->scsi_device structure.
++ * Returns 0 if SCSI device could be located, or an error code < 0 otherwise ++ */ ++static int virtio_scsi_locate_device(VDev *vdev) + { + const uint16_t channel = 0; /* again, it's what QEMU does */ + uint16_t target; +@@ -218,7 +223,7 @@ static void virtio_scsi_locate_device(VDev *vdev) + IPL_check(sdev->channel == 0, "non-zero channel requested"); + IPL_check(sdev->target <= vdev->config.scsi.max_target, "target# high"); + IPL_check(sdev->lun <= vdev->config.scsi.max_lun, "LUN# high"); +- return; ++ return 0; + } + + for (target = 0; target <= vdev->config.scsi.max_target; target++) { +@@ -245,18 +250,20 @@ static void virtio_scsi_locate_device(VDev *vdev) + */ + sdev->lun = r->lun[0].v16[0]; /* it's returned this way */ + debug_print_int("Have to use LUN", sdev->lun); +- return; /* we have to use this device */ ++ return 0; /* we have to use this device */ + } + for (i = 0; i < luns; i++) { + if (r->lun[i].v64) { + /* Look for non-zero LUN - we have where to choose from */ + sdev->lun = r->lun[i].v16[0]; + debug_print_int("Will use LUN", sdev->lun); +- return; /* we have found a device */ ++ return 0; /* we have found a device */ + } + } + } +- panic("\n!
Cannot locate virtio-scsi device !\n"); ++ ++ sclp_print("Warning: Could not locate a usable virtio-scsi device\n"); ++ return -ENODEV; + } + + int virtio_scsi_read_many(VDev *vdev, +@@ -320,17 +327,20 @@ static void scsi_parse_capacity_report(void *data, + } + } + +-void virtio_scsi_setup(VDev *vdev) ++int virtio_scsi_setup(VDev *vdev) + { + int retry_test_unit_ready = 3; + uint8_t data[256]; + uint32_t data_size = sizeof(data); + ScsiInquiryEvpdPages *evpd = &scsi_inquiry_evpd_pages_response; + ScsiInquiryEvpdBl *evpd_bl = &scsi_inquiry_evpd_bl_response; +- int i; ++ int i, ret; + + vdev->scsi_device = &default_scsi_device; +- virtio_scsi_locate_device(vdev); ++ ret = virtio_scsi_locate_device(vdev); ++ if (ret < 0) { ++ return ret; ++ } + + /* We have to "ping" the device before it becomes readable */ + while (!scsi_test_unit_ready(vdev)) { +@@ -415,4 +425,6 @@ void virtio_scsi_setup(VDev *vdev) + } + scsi_parse_capacity_report(data, &vdev->scsi_last_block, + (uint32_t *) &vdev->scsi_block_size); ++ ++ return 0; + } +diff --git a/pc-bios/s390-ccw/virtio-scsi.h b/pc-bios/s390-ccw/virtio-scsi.h +index 4c4f4bbc31..4b14c2c2f9 100644 +--- a/pc-bios/s390-ccw/virtio-scsi.h ++++ b/pc-bios/s390-ccw/virtio-scsi.h +@@ -67,7 +67,7 @@ static inline bool virtio_scsi_response_ok(const VirtioScsiCmdResp *r) + return r->response == VIRTIO_SCSI_S_OK && r->status == CDB_STATUS_GOOD; + } + +-void virtio_scsi_setup(VDev *vdev); ++int virtio_scsi_setup(VDev *vdev); + int virtio_scsi_read_many(VDev *vdev, + ulong sector, void *load_addr, int sec_num); + +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch b/kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch new file mode 100755 index 0000000..4385267 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch @@ -0,0 +1,54 @@ +From 7b3a7cbfc5872e088f13e11f5c38dc5ac80c3330 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:43 -0400 +Subject: 
[PATCH 07/14] pc-bios/s390-ccw: Introduce ENODEV define and remove + guards of others + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-4-thuth@redhat.com> +Patchwork-id: 98597 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/9] pc-bios/s390-ccw: Introduce ENODEV define and remove guards of others +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Remove the "#ifndef E..." guards from the defines here - the header +guard S390_CCW_H at the top of the file should avoid double definition, +and if the error code is defined in a different file already, we're in +trouble anyway, then it's better to see the error at compile time instead +of hunting weird behavior during runtime later. +Also define ENODEV - we will use this in a later patch. + +Message-Id: <20200806105349.632-4-thuth@redhat.com> +Reviewed-by: Cornelia Huck +Reviewed-by: Janosch Frank +Signed-off-by: Thomas Huth +(cherry picked from commit f3180b0266386b31deb7bb83fcaea68af7d1bcee) +Signed-off-by: Danilo C. L. 
de Paula +--- + pc-bios/s390-ccw/s390-ccw.h | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h +index 21f27e7990..ae432c40b8 100644 +--- a/pc-bios/s390-ccw/s390-ccw.h ++++ b/pc-bios/s390-ccw/s390-ccw.h +@@ -27,12 +27,10 @@ typedef unsigned long long __u64; + #define false 0 + #define PAGE_SIZE 4096 + +-#ifndef EIO + #define EIO 1 +-#endif +-#ifndef EBUSY + #define EBUSY 2 +-#endif ++#define ENODEV 3 ++ + #ifndef NULL + #define NULL 0 + #endif +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch b/kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch new file mode 100755 index 0000000..8f44646 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch @@ -0,0 +1,60 @@ +From eda3b6620e779ff89df46a0fb9022016bffd7f44 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:41 -0400 +Subject: [PATCH 05/14] pc-bios/s390-ccw/Makefile: Compile with -std=gnu99, + -fwrapv and -fno-common + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-2-thuth@redhat.com> +Patchwork-id: 98595 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/9] pc-bios/s390-ccw/Makefile: Compile with -std=gnu99, -fwrapv and -fno-common +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +The main QEMU code is compiled with -std=gnu99, -fwrapv and -fno-common. +We should use the same flags for the s390-ccw bios, too, to avoid that +we get different behavior with different compiler versions that changed +their default settings in the course of time (it happened at least with +-std=... and -fno-common in the past already). + +While we're at it, also group the other flags here in a little bit nicer +fashion: Move the two "-m" flags out of the "-f" area and specify them on +a separate line. 
+ +Reviewed-by: Claudio Imbrenda +Acked-by: Cornelia Huck +Acked-by: Janosch Frank +Message-Id: <20200806105349.632-2-thuth@redhat.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 4f6a1eb886961f1f9da2d553c4b0e5ef69cd3801) +Conflicts: Simple contextual conflict due to meson reworks in upstream +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/Makefile | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/pc-bios/s390-ccw/Makefile b/pc-bios/s390-ccw/Makefile +index a048b6b077..e776a2a5ec 100644 +--- a/pc-bios/s390-ccw/Makefile ++++ b/pc-bios/s390-ccw/Makefile +@@ -13,10 +13,11 @@ OBJECTS = start.o main.o bootmap.o jump2ipl.o sclp.o menu.o \ + virtio.o virtio-scsi.o virtio-blkdev.o libc.o cio.o dasd-ipl.o + + QEMU_CFLAGS := $(filter -W%, $(QEMU_CFLAGS)) +-QEMU_CFLAGS += -ffreestanding -fno-delete-null-pointer-checks -msoft-float +-QEMU_CFLAGS += -march=z900 -fPIE -fno-strict-aliasing +-QEMU_CFLAGS += -fno-asynchronous-unwind-tables ++QEMU_CFLAGS += -ffreestanding -fno-delete-null-pointer-checks -fno-common -fPIE ++QEMU_CFLAGS += -fwrapv -fno-strict-aliasing -fno-asynchronous-unwind-tables + QEMU_CFLAGS += $(call cc-option, $(QEMU_CFLAGS), -fno-stack-protector) ++QEMU_CFLAGS += -msoft-float -march=z900 ++QEMU_CFLAGS += -std=gnu99 + LDFLAGS += -Wl,-pie -nostdlib + + build-all: s390-ccw.img s390-netboot.img +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch b/kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch new file mode 100755 index 0000000..bbeac9e --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch @@ -0,0 +1,72 @@ +From 740590240bec03dc6ca208963112d3c2999f353e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:42 -0400 +Subject: [PATCH 06/14] pc-bios/s390-ccw: Move ipl-related code from main() + into a separate function + +RH-Author: Thomas Huth +Message-id: 
<20201009100849.264994-3-thuth@redhat.com> +Patchwork-id: 98596 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/9] pc-bios/s390-ccw: Move ipl-related code from main() into a separate function +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Let's move this part of the code into a separate function to be able +to use it from multiple spots later. + +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Reviewed-by: Janosch Frank +Message-Id: <20200806105349.632-3-thuth@redhat.com> +Signed-off-by: Thomas Huth +(cherry picked from commit d1f060a8b515a0b1d14c38f2c8f86ab54e79c3dc) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/main.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index 4e65b411e1..5e565be5b1 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -232,14 +232,8 @@ static void virtio_setup(void) + } + } + +-int main(void) ++static void ipl_boot_device(void) + { +- sclp_setup(); +- css_setup(); +- boot_setup(); +- find_boot_device(); +- enable_subchannel(blk_schid); +- + switch (cutype) { + case CU_TYPE_DASD_3990: + case CU_TYPE_DASD_2107: +@@ -251,8 +245,18 @@ int main(void) + break; + default: + print_int("Attempting to boot from unexpected device type", cutype); +- panic(""); ++ panic("\nBoot failed.\n"); + } ++} ++ ++int main(void) ++{ ++ sclp_setup(); ++ css_setup(); ++ boot_setup(); ++ find_boot_device(); ++ enable_subchannel(blk_schid); ++ ipl_boot_device(); + + panic("Failed to load OS from hard disk\n"); + return 0; /* make compiler happy */ +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch b/kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch new file mode 100755 index 0000000..3aa5dfd --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch @@ -0,0 +1,154 @@ +From 
d90cbb55fe3ec232091a24137cab45419aac8bc5 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:44 -0400 +Subject: [PATCH 08/14] pc-bios/s390-ccw: Move the inner logic of find_subch() + to a separate function + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-5-thuth@redhat.com> +Patchwork-id: 98598 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 4/9] pc-bios/s390-ccw: Move the inner logic of find_subch() to a separate function +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Move the code to a separate function to be able to re-use it from a +different spot later. + +Reviewed-by: Claudio Imbrenda +Message-Id: <20200806105349.632-5-thuth@redhat.com> +Reviewed-by: Cornelia Huck +Reviewed-by: Janosch Frank +Signed-off-by: Thomas Huth +(cherry picked from commit d2cf4af1f4af02f6f2d5827d9a06c31690084d3b) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/main.c | 99 ++++++++++++++++++++++++----------------- + 1 file changed, 57 insertions(+), 42 deletions(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index 5e565be5b1..d6fd218074 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -60,6 +60,60 @@ unsigned int get_loadparm_index(void) + return atoui(loadparm_str); + } + ++static int is_dev_possibly_bootable(int dev_no, int sch_no) ++{ ++ bool is_virtio; ++ Schib schib; ++ int r; ++ ++ blk_schid.sch_no = sch_no; ++ r = stsch_err(blk_schid, &schib); ++ if (r == 3 || r == -EIO) { ++ return -ENODEV; ++ } ++ if (!schib.pmcw.dnv) { ++ return false; ++ } ++ ++ enable_subchannel(blk_schid); ++ cutype = cu_type(blk_schid); ++ ++ /* ++ * Note: we always have to run virtio_is_supported() here to make ++ * sure that the vdev.senseid data gets pre-initialized correctly ++ */ ++ is_virtio = virtio_is_supported(blk_schid); ++ ++ /* No specific devno given, just return whether the device is possibly bootable */ ++ if (dev_no < 0) { ++ switch (cutype) { 
++ case CU_TYPE_VIRTIO: ++ if (is_virtio) { ++ /* ++ * Skip net devices since no IPLB is created and therefore ++ * no network bootloader has been loaded ++ */ ++ if (virtio_get_device_type() != VIRTIO_ID_NET) { ++ return true; ++ } ++ } ++ return false; ++ case CU_TYPE_DASD_3990: ++ case CU_TYPE_DASD_2107: ++ return true; ++ default: ++ return false; ++ } ++ } ++ ++ /* Caller asked for a specific devno */ ++ if (schib.pmcw.dev == dev_no) { ++ return true; ++ } ++ ++ return false; ++} ++ + /* + * Find the subchannel connected to the given device (dev_no) and fill in the + * subchannel information block (schib) with the connected subchannel's info. +@@ -71,53 +125,14 @@ unsigned int get_loadparm_index(void) + */ + static bool find_subch(int dev_no) + { +- Schib schib; + int i, r; +- bool is_virtio; + + for (i = 0; i < 0x10000; i++) { +- blk_schid.sch_no = i; +- r = stsch_err(blk_schid, &schib); +- if ((r == 3) || (r == -EIO)) { ++ r = is_dev_possibly_bootable(dev_no, i); ++ if (r < 0) { + break; + } +- if (!schib.pmcw.dnv) { +- continue; +- } +- +- enable_subchannel(blk_schid); +- cutype = cu_type(blk_schid); +- +- /* +- * Note: we always have to run virtio_is_supported() here to make +- * sure that the vdev.senseid data gets pre-initialized correctly +- */ +- is_virtio = virtio_is_supported(blk_schid); +- +- /* No specific devno given, just return 1st possibly bootable device */ +- if (dev_no < 0) { +- switch (cutype) { +- case CU_TYPE_VIRTIO: +- if (is_virtio) { +- /* +- * Skip net devices since no IPLB is created and therefore +- * no network bootloader has been loaded +- */ +- if (virtio_get_device_type() != VIRTIO_ID_NET) { +- return true; +- } +- } +- continue; +- case CU_TYPE_DASD_3990: +- case CU_TYPE_DASD_2107: +- return true; +- default: +- continue; +- } +- } +- +- /* Caller asked for a specific devno */ +- if (schib.pmcw.dev == dev_no) { ++ if (r == true) { + return true; + } + } +-- +2.27.0 + diff --git 
a/kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch b/kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch new file mode 100755 index 0000000..c8e3017 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch @@ -0,0 +1,116 @@ +From 911dc631f9ab68c6acfd4b401fbcfaa3b58a4fb6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:46 -0400 +Subject: [PATCH 10/14] pc-bios/s390-ccw: Scan through all devices if no boot + device specified + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-7-thuth@redhat.com> +Patchwork-id: 98600 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 6/9] pc-bios/s390-ccw: Scan through all devices if no boot device specified +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +If no boot device has been specified (via "bootindex=..."), the s390-ccw +bios scans through all devices to find a bootable device. But so far, it +stops at the very first block device (including virtio-scsi controllers +without attached devices) that it finds, no matter whether it is bootable +or not. That leads to some weird situation where it is e.g. possible +to boot via: + + qemu-system-s390x -hda /path/to/disk.qcow2 + +but not if there is e.g. a virtio-scsi controller specified before: + + qemu-system-s390x -device virtio-scsi -hda /path/to/disk.qcow2 + +While using "bootindex=..." is clearly the preferred way of booting +on s390x, we still can make the life for the users at least a little +bit easier if we look at all available devices to find a bootable one. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1846975 +Reviewed-by: Cornelia Huck +Message-Id: <20200806105349.632-7-thuth@redhat.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 869d0e2f593dd37297c366203f006b9acd1b7b45) +Signed-off-by: Danilo C. L. 
de Paula +--- + pc-bios/s390-ccw/main.c | 46 +++++++++++++++++++++++++++-------------- + 1 file changed, 31 insertions(+), 15 deletions(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index 456733fbee..5c1c98341d 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -191,20 +191,8 @@ static void boot_setup(void) + static void find_boot_device(void) + { + VDev *vdev = virtio_get_device(); +- int ssid; + bool found; + +- if (!have_iplb) { +- for (ssid = 0; ssid < 0x3; ssid++) { +- blk_schid.ssid = ssid; +- found = find_subch(-1); +- if (found) { +- return; +- } +- } +- panic("Could not find a suitable boot device (none specified)\n"); +- } +- + switch (iplb.pbt) { + case S390_IPL_TYPE_CCW: + debug_print_int("device no. ", iplb.ccw.devno); +@@ -270,14 +258,42 @@ static void ipl_boot_device(void) + } + } + ++/* ++ * No boot device has been specified, so we have to scan through the ++ * channels to find one. ++ */ ++static void probe_boot_device(void) ++{ ++ int ssid, sch_no, ret; ++ ++ for (ssid = 0; ssid < 0x3; ssid++) { ++ blk_schid.ssid = ssid; ++ for (sch_no = 0; sch_no < 0x10000; sch_no++) { ++ ret = is_dev_possibly_bootable(-1, sch_no); ++ if (ret < 0) { ++ break; ++ } ++ if (ret == true) { ++ ipl_boot_device(); /* Only returns if unsuccessful */ ++ } ++ } ++ } ++ ++ sclp_print("Could not find a suitable boot device (none specified)\n"); ++} ++ + int main(void) + { + sclp_setup(); + css_setup(); + boot_setup(); +- find_boot_device(); +- enable_subchannel(blk_schid); +- ipl_boot_device(); ++ if (have_iplb) { ++ find_boot_device(); ++ enable_subchannel(blk_schid); ++ ipl_boot_device(); ++ } else { ++ probe_boot_device(); ++ } + + panic("Failed to load OS from hard disk\n"); + return 0; /* make compiler happy */ +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch b/kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch new file mode 100755 index 0000000..414cc13 --- /dev/null +++ 
b/kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch @@ -0,0 +1,50 @@ +From 56ae2d8a1ee3a35e2eed4f4baa61f97184189b47 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 18 May 2021 13:51:24 -0400 +Subject: [PATCH 4/5] pc-bios/s390-ccw: break loop if a null block number is + reached +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20210518135125.191329-3-thuth@redhat.com> +Patchwork-id: 101549 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/3] pc-bios/s390-ccw: break loop if a null block number is reached +Bugzilla: 1942880 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Break the loop if `cur_block_nr` is a null block number because this +means that the end of chunk is reached. In this case we will try to +boot the default entry. + +Fixes: ba831b25262a ("s390-ccw: read stage2 boot loader data to find menu") +Reviewed-by: Collin Walling +Signed-off-by: Marc Hartmayer +Message-Id: <20200924085926.21709-3-mhartmay@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 468184ec9024f4f7b55247f70ec57554e8a500d7) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + pc-bios/s390-ccw/bootmap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c +index bb6e003270..624f524331 100644 +--- a/pc-bios/s390-ccw/bootmap.c ++++ b/pc-bios/s390-ccw/bootmap.c +@@ -192,7 +192,7 @@ static int eckd_get_boot_menu_index(block_number_t s1b_block_nr) + for (i = 0; i < STAGE2_BLK_CNT_MAX; i++) { + cur_block_nr = eckd_block_num(&s1b->seek[i].chs); + +- if (!cur_block_nr) { ++ if (!cur_block_nr || is_null_block_number(cur_block_nr)) { + break; + } + +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch b/kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch new file mode 100755 index 0000000..2597118 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch @@ -0,0 +1,48 @@ +From 52ba1903b2c8ce69e8cd1de2a78c2c63cc60383b Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 18 May 2021 13:51:25 -0400 +Subject: [PATCH 5/5] pc-bios/s390-ccw: don't try to read the next block if end + of chunk is reached +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20210518135125.191329-4-thuth@redhat.com> +Patchwork-id: 101550 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 3/3] pc-bios/s390-ccw: don't try to read the next block if end of chunk is reached +Bugzilla: 1942880 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Don't read the block if a null block number is reached, because this means that +the end of chunk is reached. + +Reviewed-by: Collin Walling +Signed-off-by: Marc Hartmayer +Message-Id: <20210416074736.17409-1-mhartmay@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit a6625d38cce3901a7c1cba069f0abcf743a293f1) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + pc-bios/s390-ccw/bootmap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c +index 624f524331..8458b15cb6 100644 +--- a/pc-bios/s390-ccw/bootmap.c ++++ b/pc-bios/s390-ccw/bootmap.c +@@ -212,7 +212,7 @@ static int eckd_get_boot_menu_index(block_number_t s1b_block_nr) + next_block_nr = eckd_block_num(&s1b->seek[i + 1].chs); + } + +- if (next_block_nr) { ++ if (next_block_nr && !is_null_block_number(next_block_nr)) { + read_block(next_block_nr, s2_next_blk, + "Cannot read stage2 boot loader"); + } +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch b/kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch new file mode 100755 index 0000000..691bed4 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch @@ -0,0 +1,51 @@ +From 0e9bdb960045f98d70f765bbb585f1647e5fea08 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 18 May 2021 13:51:23 -0400 +Subject: [PATCH 3/5] pc-bios/s390-ccw: fix off-by-one error +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20210518135125.191329-2-thuth@redhat.com> +Patchwork-id: 101548 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/3] pc-bios/s390-ccw: fix off-by-one error +Bugzilla: 1942880 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +This error takes effect when the magic value "zIPL" is located at the +end of a block. For example if s2_cur_blk = 0x7fe18000 and the magic +value "zIPL" is located at 0x7fe18ffc - 0x7fe18fff. + +Fixes: ba831b25262a ("s390-ccw: read stage2 boot loader data to find menu") +Reviewed-by: Collin Walling +Signed-off-by: Marc Hartmayer +Message-Id: <20200924085926.21709-2-mhartmay@linux.ibm.com> +Reviewed-by: Thomas Huth +[thuth: Use "<= ... - 4" instead of "< ... 
- 3"] +Signed-off-by: Thomas Huth +(cherry picked from commit 5f97ba0c74ccace0a4014460de9751ff3c6f454a) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/bootmap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c +index e91ea719ff..bb6e003270 100644 +--- a/pc-bios/s390-ccw/bootmap.c ++++ b/pc-bios/s390-ccw/bootmap.c +@@ -163,7 +163,7 @@ static bool find_zipl_boot_menu_banner(int *offset) + int i; + + /* Menu banner starts with "zIPL" */ +- for (i = 0; i < virtio_get_block_size() - 4; i++) { ++ for (i = 0; i <= virtio_get_block_size() - 4; i++) { + if (magic_match(s2_cur_blk + i, ZIPL_MAGIC_EBCDIC)) { + *offset = i; + return true; +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch b/kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch new file mode 100755 index 0000000..cf1466a --- /dev/null +++ b/kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch @@ -0,0 +1,43 @@ +From 541d06b7dc1cd3ad4722850f3a7f5df12b8d6fba Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:48 -0400 +Subject: [PATCH 12/14] pc-bios/s390-ccw/main: Remove superfluous call to + enable_subchannel() + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-9-thuth@redhat.com> +Patchwork-id: 98602 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 8/9] pc-bios/s390-ccw/main: Remove superfluous call to enable_subchannel() +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +enable_subchannel() is already done during is_dev_possibly_bootable() +(which is called from find_boot_device() -> find_subch()), so there +is no need to do this again in the main() function. 
+ +Message-Id: <20200806105349.632-9-thuth@redhat.com> +Reviewed-by: Cornelia Huck +Signed-off-by: Thomas Huth +(cherry picked from commit 49d4388ec03fd8c7701b907a4e11c437a28f8572) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/main.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index b5c721c395..e3a1a3053d 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -289,7 +289,6 @@ int main(void) + boot_setup(); + if (have_iplb) { + find_boot_device(); +- enable_subchannel(blk_schid); + ipl_boot_device(); + } else { + probe_boot_device(); +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch b/kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch new file mode 100755 index 0000000..8334b7b --- /dev/null +++ b/kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch @@ -0,0 +1,87 @@ +From c6f62870f27ece45e944d1818f6aa04b3e024959 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 10 Dec 2020 08:32:41 -0500 +Subject: [PATCH 5/5] pc-bios: s390x: Clear out leftover S390EP string + +RH-Author: Thomas Huth +Message-id: <20201210083241.173509-5-thuth@redhat.com> +Patchwork-id: 100369 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 4/4] pc-bios: s390x: Clear out leftover S390EP string +Bugzilla: 1903135 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand + +From: Eric Farman + +A Linux binary will have the string "S390EP" at address 0x10008, +which is important in getting the guest up off the ground. In the +case of a reboot (specifically chreipl going to a new device), +we should defer to the PSW at address zero for the new config, +which will re-write "S390EP" from the new image. + +Let's clear it out at this point so that a reipl to, say, a DASD +passthrough device drives the IPL path from scratch without +disrupting the order of operations for other boots. 
+ +Rather than hardcoding the address of this magic (again), let's +define it somewhere so that the two users are visibly related. + +Signed-off-by: Eric Farman +Message-Id: <20201120160117.59366-3-farman@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 3d6519968bb10260fc724c491fb4275f7c0b78ac) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/jump2ipl.c | 2 +- + pc-bios/s390-ccw/main.c | 6 ++++++ + pc-bios/s390-ccw/s390-arch.h | 3 +++ + 3 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/jump2ipl.c b/pc-bios/s390-ccw/jump2ipl.c +index 767012bf0c9..6c6823b5db8 100644 +--- a/pc-bios/s390-ccw/jump2ipl.c ++++ b/pc-bios/s390-ccw/jump2ipl.c +@@ -78,7 +78,7 @@ void jump_to_low_kernel(void) + * kernel start address (when jumping to the PSW-at-zero address instead, + * the kernel startup code fails when we booted from a network device). + */ +- if (!memcmp((char *)0x10008, "S390EP", 6)) { ++ if (!memcmp((char *)S390EP, "S390EP", 6)) { + jump_to_IPL_code(KERN_IMAGE_START); + } + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index e3a1a3053d0..c04b910082b 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -185,6 +185,12 @@ static void boot_setup(void) + memcpy(lpmsg + 10, loadparm_str, 8); + sclp_print(lpmsg); + ++ /* ++ * Clear out any potential S390EP magic (see jump_to_low_kernel()), ++ * so we don't taint our decision-making process during a reboot. 
++ */ ++ memset((char *)S390EP, 0, 6); ++ + have_iplb = store_iplb(&iplb); + } + +diff --git a/pc-bios/s390-ccw/s390-arch.h b/pc-bios/s390-ccw/s390-arch.h +index 6da44d4436c..a741488aaa1 100644 +--- a/pc-bios/s390-ccw/s390-arch.h ++++ b/pc-bios/s390-ccw/s390-arch.h +@@ -95,6 +95,9 @@ typedef struct LowCore { + + extern LowCore *lowcore; + ++/* Location of "S390EP" in a Linux binary (see arch/s390/boot/head.S) */ ++#define S390EP 0x10008 ++ + static inline void set_prefix(uint32_t address) + { + asm volatile("spx %0" : : "m" (address) : "memory"); +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch b/kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch new file mode 100755 index 0000000..9d09be3 --- /dev/null +++ b/kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch @@ -0,0 +1,63 @@ +From 6b19062226ecebf63d2d0b0ff05b5bcfa7a05818 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 10 Dec 2020 08:32:40 -0500 +Subject: [PATCH 4/5] pc-bios: s390x: Ensure Read IPL memory is clean + +RH-Author: Thomas Huth +Message-id: <20201210083241.173509-4-thuth@redhat.com> +Patchwork-id: 100372 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/4] pc-bios: s390x: Ensure Read IPL memory is clean +Bugzilla: 1903135 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand + +From: Eric Farman + +If, for example, we boot off a virtio device and chreipl to a vfio-ccw +device, the space at lowcore will be non-zero. We build a Read IPL CCW +at address zero, but it will have leftover PSW data that will conflict +with the Format-0 CCW being generated: + +0x0: 00080000 80010000 + ------ Ccw0.cda + -- Ccw0.chainData + -- Reserved bits + +The data address will be overwritten with the correct value (0x0), but +the apparent data chain bit will cause subsequent memory to be used as +the target of the data store, which may not be where we expect (0x0). 
+ +Clear out this space when we boot from DASD, so that we know it exists +exactly as we expect. + +Signed-off-by: Eric Farman +Reviewed-by: Jason J. Herne +Reviewed-by: Janosch Frank +Acked-by: Christian Borntraeger +Acked-by: Cornelia Huck +Message-Id: <20201120160117.59366-2-farman@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit d8e5bbdd0d6fa8d9b5ac15de62c87105d92ff558) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/dasd-ipl.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/pc-bios/s390-ccw/dasd-ipl.c b/pc-bios/s390-ccw/dasd-ipl.c +index 0fc879bb8e8..71cbae2f16e 100644 +--- a/pc-bios/s390-ccw/dasd-ipl.c ++++ b/pc-bios/s390-ccw/dasd-ipl.c +@@ -100,6 +100,9 @@ static void make_readipl(void) + { + Ccw0 *ccwIplRead = (Ccw0 *)0x00; + ++ /* Clear out any existing data */ ++ memset(ccwIplRead, 0, sizeof(Ccw0)); ++ + /* Create Read IPL ccw at address 0 */ + ccwIplRead->cmd_code = CCW_CMD_READ_IPL; + ccwIplRead->cda = 0x00; /* Read into address 0x00 in main memory */ +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Fix-reset-psw-mask.patch b/kvm-pc-bios-s390x-Fix-reset-psw-mask.patch new file mode 100755 index 0000000..9c45e92 --- /dev/null +++ b/kvm-pc-bios-s390x-Fix-reset-psw-mask.patch @@ -0,0 +1,75 @@ +From 38ba55dd27a3b8308f0ce2e82a4c3eba3f197d20 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:53 -0400 +Subject: [PATCH 11/42] pc-bios/s390x: Fix reset psw mask + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-12-thuth@redhat.com> +Patchwork-id: 97034 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 11/38] pc-bios/s390x: Fix reset psw mask +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +We need to set the short psw indication bit in the reset psw, as it is +a short psw. + +Exposed by "s390x: Properly fetch and test the short psw on diag308 +subc 0/1". 
+ +Fixes: 962982329029 ("pc-bios/s390-ccw: do a subsystem reset before running the guest") +Signed-off-by: Janosch Frank +Message-Id: <20191203132813.2734-5-frankja@linux.ibm.com> +Acked-by: Christian Borntraeger +Signed-off-by: Cornelia Huck +(cherry picked from commit 5c6f0d5f46a77d77460dfb518cf1e1e4145c276e) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/jump2ipl.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +diff --git a/pc-bios/s390-ccw/jump2ipl.c b/pc-bios/s390-ccw/jump2ipl.c +index 266f1502b9..da13c43cc0 100644 +--- a/pc-bios/s390-ccw/jump2ipl.c ++++ b/pc-bios/s390-ccw/jump2ipl.c +@@ -12,11 +12,11 @@ + #define KERN_IMAGE_START 0x010000UL + #define PSW_MASK_64 0x0000000100000000ULL + #define PSW_MASK_32 0x0000000080000000ULL +-#define IPL_PSW_MASK (PSW_MASK_32 | PSW_MASK_64) ++#define PSW_MASK_SHORTPSW 0x0008000000000000ULL ++#define RESET_PSW_MASK (PSW_MASK_SHORTPSW | PSW_MASK_32 | PSW_MASK_64) + + typedef struct ResetInfo { +- uint32_t ipl_mask; +- uint32_t ipl_addr; ++ uint64_t ipl_psw; + uint32_t ipl_continue; + } ResetInfo; + +@@ -50,7 +50,9 @@ void jump_to_IPL_code(uint64_t address) + ResetInfo *current = 0; + + save = *current; +- current->ipl_addr = (uint32_t) (uint64_t) &jump_to_IPL_2; ++ ++ current->ipl_psw = (uint64_t) &jump_to_IPL_2; ++ current->ipl_psw |= RESET_PSW_MASK; + current->ipl_continue = address & 0x7fffffff; + + debug_print_int("set IPL addr to", current->ipl_continue); +@@ -82,7 +84,7 @@ void jump_to_low_kernel(void) + } + + /* Trying to get PSW at zero address */ +- if (*((uint64_t *)0) & IPL_PSW_MASK) { ++ if (*((uint64_t *)0) & RESET_PSW_MASK) { + jump_to_IPL_code((*((uint64_t *)0)) & 0x7fffffff); + } + +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch b/kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch new file mode 100755 index 0000000..8ba4530 --- /dev/null +++ b/kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch @@ -0,0 +1,45 @@ 
+From 494ce6ed658a806af36d4f50600e44740a446011 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 10 Dec 2020 08:32:38 -0500 +Subject: [PATCH 2/5] pc-bios: s390x: Rename PSW_MASK_ZMODE to PSW_MASK_64 + +RH-Author: Thomas Huth +Message-id: <20201210083241.173509-2-thuth@redhat.com> +Patchwork-id: 100370 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/4] pc-bios: s390x: Rename PSW_MASK_ZMODE to PSW_MASK_64 +Bugzilla: 1903135 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +This constant enables 64 bit addressing, not the ESAME architecture, +so it shouldn't be named ZMODE. + +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Message-Id: <20200624075226.92728-7-frankja@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit b88faa1c899db2fae8b5b168aeb6c47bef090f27) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/s390-arch.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/s390-arch.h b/pc-bios/s390-ccw/s390-arch.h +index 5f36361c022..73852029d4e 100644 +--- a/pc-bios/s390-ccw/s390-arch.h ++++ b/pc-bios/s390-ccw/s390-arch.h +@@ -29,7 +29,7 @@ _Static_assert(sizeof(struct PSWLegacy) == 8, "PSWLegacy size incorrect"); + #define PSW_MASK_WAIT 0x0002000000000000ULL + #define PSW_MASK_EAMODE 0x0000000100000000ULL + #define PSW_MASK_BAMODE 0x0000000080000000ULL +-#define PSW_MASK_ZMODE (PSW_MASK_EAMODE | PSW_MASK_BAMODE) ++#define PSW_MASK_64 (PSW_MASK_EAMODE | PSW_MASK_BAMODE) + + /* Low core mapping */ + typedef struct LowCore { +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch b/kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch new file mode 100755 index 0000000..2db2f93 --- /dev/null +++ b/kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch @@ -0,0 +1,145 @@ +From 8350ad9c0f54519a06ec396c2997330615f4b470 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:58 -0400 +Subject: 
[PATCH 16/42] pc-bios: s390x: Save iplb location in lowcore + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-17-thuth@redhat.com> +Patchwork-id: 97027 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 16/38] pc-bios: s390x: Save iplb location in lowcore +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +The POP states that for a list directed IPL the IPLB is stored into +memory by the machine loader and its address is stored at offset 0x14 +of the lowcore. + +ZIPL currently uses the address in offset 0x14 to access the IPLB and +acquire flags about secure boot. If the IPLB address points into +memory which has an unsupported mix of flags set, ZIPL will panic +instead of booting the OS. + +As the lowcore can have quite a high entropy for a guest that did drop +out of protected mode (i.e. rebooted) we encountered the ZIPL panic +quite often. + +Signed-off-by: Janosch Frank +Tested-by: Marc Hartmayer +Message-Id: <20200304114231.23493-19-frankja@linux.ibm.com> +Reviewed-by: Christian Borntraeger +Reviewed-by: David Hildenbrand +Signed-off-by: Christian Borntraeger +(cherry picked from commit 9bfc04f9ef6802fff0fc77130ff345a541783363) +Signed-off-by: Danilo C. L. 
de Paula +--- + pc-bios/s390-ccw/jump2ipl.c | 1 + + pc-bios/s390-ccw/main.c | 8 +++++++- + pc-bios/s390-ccw/netmain.c | 1 + + pc-bios/s390-ccw/s390-arch.h | 10 ++++++++-- + pc-bios/s390-ccw/s390-ccw.h | 1 + + 5 files changed, 18 insertions(+), 3 deletions(-) + +diff --git a/pc-bios/s390-ccw/jump2ipl.c b/pc-bios/s390-ccw/jump2ipl.c +index da13c43cc0..4eba2510b0 100644 +--- a/pc-bios/s390-ccw/jump2ipl.c ++++ b/pc-bios/s390-ccw/jump2ipl.c +@@ -35,6 +35,7 @@ void jump_to_IPL_code(uint64_t address) + { + /* store the subsystem information _after_ the bootmap was loaded */ + write_subsystem_identification(); ++ write_iplb_location(); + + /* prevent unknown IPL types in the guest */ + if (iplb.pbt == S390_IPL_TYPE_QEMU_SCSI) { +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index a21b386280..4e65b411e1 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -9,6 +9,7 @@ + */ + + #include "libc.h" ++#include "helper.h" + #include "s390-arch.h" + #include "s390-ccw.h" + #include "cio.h" +@@ -22,7 +23,7 @@ QemuIplParameters qipl; + IplParameterBlock iplb __attribute__((__aligned__(PAGE_SIZE))); + static bool have_iplb; + static uint16_t cutype; +-LowCore const *lowcore; /* Yes, this *is* a pointer to address 0 */ ++LowCore *lowcore; /* Yes, this *is* a pointer to address 0 */ + + #define LOADPARM_PROMPT "PROMPT " + #define LOADPARM_EMPTY " " +@@ -42,6 +43,11 @@ void write_subsystem_identification(void) + *zeroes = 0; + } + ++void write_iplb_location(void) ++{ ++ lowcore->ptr_iplb = ptr2u32(&iplb); ++} ++ + void panic(const char *string) + { + sclp_print(string); +diff --git a/pc-bios/s390-ccw/netmain.c b/pc-bios/s390-ccw/netmain.c +index f2dcc01e27..309ffa30d9 100644 +--- a/pc-bios/s390-ccw/netmain.c ++++ b/pc-bios/s390-ccw/netmain.c +@@ -40,6 +40,7 @@ + #define DEFAULT_TFTP_RETRIES 20 + + extern char _start[]; ++void write_iplb_location(void) {} + + #define KERNEL_ADDR ((void *)0L) + #define KERNEL_MAX_SIZE ((long)_start) +diff --git 
a/pc-bios/s390-ccw/s390-arch.h b/pc-bios/s390-ccw/s390-arch.h +index 504fc7c2f0..5f36361c02 100644 +--- a/pc-bios/s390-ccw/s390-arch.h ++++ b/pc-bios/s390-ccw/s390-arch.h +@@ -36,7 +36,13 @@ typedef struct LowCore { + /* prefix area: defined by architecture */ + PSWLegacy ipl_psw; /* 0x000 */ + uint32_t ccw1[2]; /* 0x008 */ +- uint32_t ccw2[2]; /* 0x010 */ ++ union { ++ uint32_t ccw2[2]; /* 0x010 */ ++ struct { ++ uint32_t reserved10; ++ uint32_t ptr_iplb; ++ }; ++ }; + uint8_t pad1[0x80 - 0x18]; /* 0x018 */ + uint32_t ext_params; /* 0x080 */ + uint16_t cpu_addr; /* 0x084 */ +@@ -85,7 +91,7 @@ typedef struct LowCore { + PSW io_new_psw; /* 0x1f0 */ + } __attribute__((packed, aligned(8192))) LowCore; + +-extern LowCore const *lowcore; ++extern LowCore *lowcore; + + static inline void set_prefix(uint32_t address) + { +diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h +index 11bce7d73c..21f27e7990 100644 +--- a/pc-bios/s390-ccw/s390-ccw.h ++++ b/pc-bios/s390-ccw/s390-ccw.h +@@ -57,6 +57,7 @@ void consume_io_int(void); + /* main.c */ + void panic(const char *string); + void write_subsystem_identification(void); ++void write_iplb_location(void); + extern char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE))); + unsigned int get_loadparm_index(void); + +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch b/kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch new file mode 100755 index 0000000..576447d --- /dev/null +++ b/kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch @@ -0,0 +1,89 @@ +From 35891c9334058c02f3ee83eee1a986802387c18b Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 10 Dec 2020 08:32:39 -0500 +Subject: [PATCH 3/5] pc-bios: s390x: Use PSW masks where possible and + introduce PSW_MASK_SHORT_ADDR + +RH-Author: Thomas Huth +Message-id: <20201210083241.173509-3-thuth@redhat.com> +Patchwork-id: 100371 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/4] pc-bios: s390x: Use 
PSW masks where possible and introduce PSW_MASK_SHORT_ADDR +Bugzilla: 1903135 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's move some of the PSW mask defines into s390-arch.h and use them +in jump2ipl.c. Also let's introduce a new constant for the address +mask of 8 byte (short) PSWs. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Thomas Huth +Message-Id: <20200624075226.92728-8-frankja@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit fe75c657b8ee962da79f5d3518b139e26dc69c24) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/jump2ipl.c | 10 ++++------ + pc-bios/s390-ccw/s390-arch.h | 2 ++ + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/pc-bios/s390-ccw/jump2ipl.c b/pc-bios/s390-ccw/jump2ipl.c +index 4eba2510b04..767012bf0c9 100644 +--- a/pc-bios/s390-ccw/jump2ipl.c ++++ b/pc-bios/s390-ccw/jump2ipl.c +@@ -8,12 +8,10 @@ + + #include "libc.h" + #include "s390-ccw.h" ++#include "s390-arch.h" + + #define KERN_IMAGE_START 0x010000UL +-#define PSW_MASK_64 0x0000000100000000ULL +-#define PSW_MASK_32 0x0000000080000000ULL +-#define PSW_MASK_SHORTPSW 0x0008000000000000ULL +-#define RESET_PSW_MASK (PSW_MASK_SHORTPSW | PSW_MASK_32 | PSW_MASK_64) ++#define RESET_PSW_MASK (PSW_MASK_SHORTPSW | PSW_MASK_64) + + typedef struct ResetInfo { + uint64_t ipl_psw; +@@ -54,7 +52,7 @@ void jump_to_IPL_code(uint64_t address) + + current->ipl_psw = (uint64_t) &jump_to_IPL_2; + current->ipl_psw |= RESET_PSW_MASK; +- current->ipl_continue = address & 0x7fffffff; ++ current->ipl_continue = address & PSW_MASK_SHORT_ADDR; + + debug_print_int("set IPL addr to", current->ipl_continue); + +@@ -86,7 +84,7 @@ void jump_to_low_kernel(void) + + /* Trying to get PSW at zero address */ + if (*((uint64_t *)0) & RESET_PSW_MASK) { +- jump_to_IPL_code((*((uint64_t *)0)) & 0x7fffffff); ++ jump_to_IPL_code((*((uint64_t *)0)) & PSW_MASK_SHORT_ADDR); + } + + 
/* No other option left, so use the Linux kernel start address */ +diff --git a/pc-bios/s390-ccw/s390-arch.h b/pc-bios/s390-ccw/s390-arch.h +index 73852029d4e..6da44d4436c 100644 +--- a/pc-bios/s390-ccw/s390-arch.h ++++ b/pc-bios/s390-ccw/s390-arch.h +@@ -26,9 +26,11 @@ _Static_assert(sizeof(struct PSWLegacy) == 8, "PSWLegacy size incorrect"); + + /* s390 psw bit masks */ + #define PSW_MASK_IOINT 0x0200000000000000ULL ++#define PSW_MASK_SHORTPSW 0x0008000000000000ULL + #define PSW_MASK_WAIT 0x0002000000000000ULL + #define PSW_MASK_EAMODE 0x0000000100000000ULL + #define PSW_MASK_BAMODE 0x0000000080000000ULL ++#define PSW_MASK_SHORT_ADDR 0x000000007fffffffULL + #define PSW_MASK_64 (PSW_MASK_EAMODE | PSW_MASK_BAMODE) + + /* Low core mapping */ +-- +2.27.0 + diff --git a/kvm-pcie_root_port-Add-hotplug-disabling-option.patch b/kvm-pcie_root_port-Add-hotplug-disabling-option.patch new file mode 100755 index 0000000..57f3c3b --- /dev/null +++ b/kvm-pcie_root_port-Add-hotplug-disabling-option.patch @@ -0,0 +1,153 @@ +From 8587278a20283851081d4d282d11ef6bafd17dc2 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Tue, 17 Mar 2020 13:56:39 -0400 +Subject: [PATCH 1/2] pcie_root_port: Add hotplug disabling option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Julia Suvorova +Message-id: <20200317135639.65085-1-jusual@redhat.com> +Patchwork-id: 94367 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] pcie_root_port: Add hotplug disabling option +Bugzilla: 1790899 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Peter Xu + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1790899 +BRANCH: rhel-av-8.2.1 +UPSTREAM: merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=27302449 + +Make hot-plug/hot-unplug on PCIe Root Ports optional to allow libvirt +manage it and restrict unplug for the whole machine. 
This is going to +prevent user-initiated unplug in guests (Windows mostly). +Hotplug is enabled by default. +Usage: + -device pcie-root-port,hotplug=off,... + +If you want to disable hot-unplug on some downstream ports of one +switch, disable hot-unplug on PCIe Root Port connected to the upstream +port as well as on the selected downstream ports. + +Discussion related: + https://lists.gnu.org/archive/html/qemu-devel/2020-02/msg00530.html + +Signed-off-by: Julia Suvorova +Message-Id: <20200226174607.205941-1-jusual@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Ján Tomko +(cherry picked from commit 530a0963184e57e71a5b538e9161f115df533e96) +Signed-off-by: Jon Maloy +--- + hw/pci-bridge/pcie_root_port.c | 2 +- + hw/pci-bridge/xio3130_downstream.c | 2 +- + hw/pci/pcie.c | 11 +++++++---- + hw/pci/pcie_port.c | 1 + + include/hw/pci/pcie.h | 2 +- + include/hw/pci/pcie_port.h | 3 +++ + 6 files changed, 14 insertions(+), 7 deletions(-) + +diff --git a/hw/pci-bridge/pcie_root_port.c b/hw/pci-bridge/pcie_root_port.c +index 012c2cb12c..db80e2ec23 100644 +--- a/hw/pci-bridge/pcie_root_port.c ++++ b/hw/pci-bridge/pcie_root_port.c +@@ -94,7 +94,7 @@ static void rp_realize(PCIDevice *d, Error **errp) + + pcie_cap_arifwd_init(d); + pcie_cap_deverr_init(d); +- pcie_cap_slot_init(d, s->slot); ++ pcie_cap_slot_init(d, s); + pcie_cap_root_init(d); + + pcie_chassis_create(s->chassis); +diff --git a/hw/pci-bridge/xio3130_downstream.c b/hw/pci-bridge/xio3130_downstream.c +index a9f084b863..4489ce4a40 100644 +--- a/hw/pci-bridge/xio3130_downstream.c ++++ b/hw/pci-bridge/xio3130_downstream.c +@@ -94,7 +94,7 @@ static void xio3130_downstream_realize(PCIDevice *d, Error **errp) + } + pcie_cap_flr_init(d); + pcie_cap_deverr_init(d); +- pcie_cap_slot_init(d, s->slot); ++ pcie_cap_slot_init(d, s); + pcie_cap_arifwd_init(d); + + pcie_chassis_create(s->chassis); +diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c +index 08718188bb..0eb3a2a5d2 100644 +--- 
a/hw/pci/pcie.c ++++ b/hw/pci/pcie.c +@@ -495,7 +495,7 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, + + /* pci express slot for pci express root/downstream port + PCI express capability slot registers */ +-void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot) ++void pcie_cap_slot_init(PCIDevice *dev, PCIESlot *s) + { + uint32_t pos = dev->exp.exp_cap; + +@@ -505,13 +505,16 @@ void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot) + pci_long_test_and_clear_mask(dev->config + pos + PCI_EXP_SLTCAP, + ~PCI_EXP_SLTCAP_PSN); + pci_long_test_and_set_mask(dev->config + pos + PCI_EXP_SLTCAP, +- (slot << PCI_EXP_SLTCAP_PSN_SHIFT) | ++ (s->slot << PCI_EXP_SLTCAP_PSN_SHIFT) | + PCI_EXP_SLTCAP_EIP | +- PCI_EXP_SLTCAP_HPS | +- PCI_EXP_SLTCAP_HPC | + PCI_EXP_SLTCAP_PIP | + PCI_EXP_SLTCAP_AIP | + PCI_EXP_SLTCAP_ABP); ++ if (s->hotplug) { ++ pci_long_test_and_set_mask(dev->config + pos + PCI_EXP_SLTCAP, ++ PCI_EXP_SLTCAP_HPS | ++ PCI_EXP_SLTCAP_HPC); ++ } + + if (dev->cap_present & QEMU_PCIE_SLTCAP_PCP) { + pci_long_test_and_set_mask(dev->config + pos + PCI_EXP_SLTCAP, +diff --git a/hw/pci/pcie_port.c b/hw/pci/pcie_port.c +index c19a9be592..36dac33d98 100644 +--- a/hw/pci/pcie_port.c ++++ b/hw/pci/pcie_port.c +@@ -147,6 +147,7 @@ static const TypeInfo pcie_port_type_info = { + static Property pcie_slot_props[] = { + DEFINE_PROP_UINT8("chassis", PCIESlot, chassis, 0), + DEFINE_PROP_UINT16("slot", PCIESlot, slot, 0), ++ DEFINE_PROP_BOOL("hotplug", PCIESlot, hotplug, true), + DEFINE_PROP_END_OF_LIST() + }; + +diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h +index 7064875835..14c58ebdb6 100644 +--- a/include/hw/pci/pcie.h ++++ b/include/hw/pci/pcie.h +@@ -104,7 +104,7 @@ void pcie_cap_deverr_reset(PCIDevice *dev); + void pcie_cap_lnkctl_init(PCIDevice *dev); + void pcie_cap_lnkctl_reset(PCIDevice *dev); + +-void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot); ++void pcie_cap_slot_init(PCIDevice *dev, PCIESlot *s); + void 
pcie_cap_slot_reset(PCIDevice *dev); + void pcie_cap_slot_get(PCIDevice *dev, uint16_t *slt_ctl, uint16_t *slt_sta); + void pcie_cap_slot_write_config(PCIDevice *dev, +diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h +index 7515430087..7072cc8731 100644 +--- a/include/hw/pci/pcie_port.h ++++ b/include/hw/pci/pcie_port.h +@@ -55,6 +55,9 @@ struct PCIESlot { + + /* Disable ACS (really for a pcie_root_port) */ + bool disable_acs; ++ ++ /* Indicates whether hot-plug is enabled on the slot */ ++ bool hotplug; + QLIST_ENTRY(PCIESlot) next; + }; + +-- +2.18.2 + diff --git a/kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch b/kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch new file mode 100755 index 0000000..8c33334 --- /dev/null +++ b/kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch @@ -0,0 +1,54 @@ +From b36a9259e085b4d32532d896e485889181b130ae Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:45 -0400 +Subject: [PATCH 7/9] pcnet: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-8-jmaloy@redhat.com> +Patchwork-id: 101791 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 7/9] pcnet: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Alexander Bulekov + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Buglink: https://bugs.launchpad.net/qemu/+bug/1917085 +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 99ccfaa1edafd79f7a3a0ff7b58ae4da7c514928) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/pcnet.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c +index f3f18d8598..dcd3fc4948 100644 +--- a/hw/net/pcnet.c ++++ b/hw/net/pcnet.c +@@ -1250,7 +1250,7 @@ txagain: + if (BCR_SWSTYLE(s) == 1) + add_crc = !GET_FIELD(tmd.status, TMDS, NOFCS); + s->looptest = add_crc ? PCNET_LOOPTEST_CRC : PCNET_LOOPTEST_NOCRC; +- pcnet_receive(qemu_get_queue(s->nic), s->buffer, s->xmit_pos); ++ qemu_receive_packet(qemu_get_queue(s->nic), s->buffer, s->xmit_pos); + s->looptest = 0; + } else { + if (s->nic) { +-- +2.27.0 + diff --git a/kvm-ppc-Deassert-the-external-interrupt-pin-in-KVM-on-re.patch b/kvm-ppc-Deassert-the-external-interrupt-pin-in-KVM-on-re.patch new file mode 100755 index 0000000..2dbdb16 --- /dev/null +++ b/kvm-ppc-Deassert-the-external-interrupt-pin-in-KVM-on-re.patch @@ -0,0 +1,107 @@ +From 22fc9bd7e7ae0b72c6f6e483eb66cf996f519766 Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Tue, 21 Jan 2020 05:16:11 +0000 +Subject: [PATCH 01/15] ppc: Deassert the external interrupt pin in KVM on + reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: David Gibson +Message-id: <20200121051613.388295-2-dgibson@redhat.com> +Patchwork-id: 93429 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 1/3] ppc: Deassert the external interrupt pin in KVM on reset +Bugzilla: 1776638 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Greg Kurz + +When a CPU is reset, QEMU makes sure no interrupt is pending by clearing +CPUPPCstate::pending_interrupts in ppc_cpu_reset(). In the case of a +complete machine emulation, eg. a sPAPR machine, an external interrupt +request could still be pending in KVM though, eg. an IPI. It will be +eventually presented to the guest, which is supposed to acknowledge it at +the interrupt controller. 
If the interrupt controller is emulated in QEMU, +either XICS or XIVE, ppc_set_irq() won't deassert the external interrupt +pin in KVM since it isn't pending anymore for QEMU. When the vCPU re-enters +the guest, the interrupt request is still pending and the vCPU will try +again to acknowledge it. This causes an infinite loop and eventually hangs +the guest. + +The code has been broken since the beginning. The issue wasn't hit before +because accel=kvm,kernel-irqchip=off is an awkward setup that never got +used until recently with the LC92x IBM systems (aka, Boston). + +Add a ppc_irq_reset() function to do the necessary cleanup, ie. deassert +the IRQ pins of the CPU in QEMU and most importantly the external interrupt +pin for this vCPU in KVM. + +Reported-by: Satheesh Rajendran +Signed-off-by: Greg Kurz +Message-Id: <157548861740.3650476.16879693165328764758.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 401774387aeb37f2ada9bb18f7c7e307b21a3e93) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1776638 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/ppc/ppc.c | 8 ++++++++ + include/hw/ppc/ppc.h | 2 ++ + target/ppc/translate_init.inc.c | 1 + + 3 files changed, 11 insertions(+) + +diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c +index 52a18eb..d554b64 100644 +--- a/hw/ppc/ppc.c ++++ b/hw/ppc/ppc.c +@@ -1510,3 +1510,11 @@ PowerPCCPU *ppc_get_vcpu_by_pir(int pir) + + return NULL; + } ++ ++void ppc_irq_reset(PowerPCCPU *cpu) ++{ ++ CPUPPCState *env = &cpu->env; ++ ++ env->irq_input_state = 0; ++ kvmppc_set_interrupt(cpu, PPC_INTERRUPT_EXT, 0); ++} +diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h +index 4bdcb8b..5dd7531 100644 +--- a/include/hw/ppc/ppc.h ++++ b/include/hw/ppc/ppc.h +@@ -76,6 +76,7 @@ static inline void ppc970_irq_init(PowerPCCPU *cpu) {} + static inline void ppcPOWER7_irq_init(PowerPCCPU *cpu) {} + static inline void ppcPOWER9_irq_init(PowerPCCPU *cpu) {} + static inline void ppce500_irq_init(PowerPCCPU *cpu) {} ++static inline void ppc_irq_reset(PowerPCCPU *cpu) {} + #else + void ppc40x_irq_init(PowerPCCPU *cpu); + void ppce500_irq_init(PowerPCCPU *cpu); +@@ -83,6 +84,7 @@ void ppc6xx_irq_init(PowerPCCPU *cpu); + void ppc970_irq_init(PowerPCCPU *cpu); + void ppcPOWER7_irq_init(PowerPCCPU *cpu); + void ppcPOWER9_irq_init(PowerPCCPU *cpu); ++void ppc_irq_reset(PowerPCCPU *cpu); + #endif + + /* PPC machines for OpenBIOS */ +diff --git a/target/ppc/translate_init.inc.c b/target/ppc/translate_init.inc.c +index ba726de..64a8380 100644 +--- a/target/ppc/translate_init.inc.c ++++ b/target/ppc/translate_init.inc.c +@@ -10461,6 +10461,7 @@ static void ppc_cpu_reset(CPUState *s) + env->pending_interrupts = 0; + s->exception_index = POWERPC_EXCP_NONE; + env->error_code = 0; ++ ppc_irq_reset(cpu); + + /* tininess for underflow is detected before rounding */ + set_float_detect_tininess(float_tininess_before_rounding, +-- +1.8.3.1 + diff --git a/kvm-ppc-Don-t-use-CPUPPCState-irq_input_state-with-moder.patch b/kvm-ppc-Don-t-use-CPUPPCState-irq_input_state-with-moder.patch new file mode 100755 
index 0000000..457d149 --- /dev/null +++ b/kvm-ppc-Don-t-use-CPUPPCState-irq_input_state-with-moder.patch @@ -0,0 +1,112 @@ +From f2f57c1ed926384e074d2048cdbdc30ee2f426eb Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Tue, 21 Jan 2020 05:16:13 +0000 +Subject: [PATCH 03/15] ppc: Don't use CPUPPCState::irq_input_state with modern + Book3s CPU models +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: David Gibson +Message-id: <20200121051613.388295-4-dgibson@redhat.com> +Patchwork-id: 93431 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 3/3] ppc: Don't use CPUPPCState::irq_input_state with modern Book3s CPU models +Bugzilla: 1776638 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Greg Kurz + +The power7_set_irq() and power9_set_irq() functions set this but it is +never used actually. Modern Book3s compatible CPUs are only supported +by the pnv and spapr machines. They have an interrupt controller, XICS +for POWER7/8 and XIVE for POWER9, whose models don't require to track +IRQ input states at the CPU level. + +Drop these lines to avoid confusion. + +Signed-off-by: Greg Kurz +Message-Id: <157548862861.3650476.16622818876928044450.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit c1ad0b892ce20cf2b5e619c79e8a0c4c66b235dc) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1776638 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/ppc/ppc.c | 16 ++-------------- + target/ppc/cpu.h | 4 +++- + 2 files changed, 5 insertions(+), 15 deletions(-) + +diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c +index d554b64..730a41f 100644 +--- a/hw/ppc/ppc.c ++++ b/hw/ppc/ppc.c +@@ -275,10 +275,9 @@ void ppc970_irq_init(PowerPCCPU *cpu) + static void power7_set_irq(void *opaque, int pin, int level) + { + PowerPCCPU *cpu = opaque; +- CPUPPCState *env = &cpu->env; + + LOG_IRQ("%s: env %p pin %d level %d\n", __func__, +- env, pin, level); ++ &cpu->env, pin, level); + + switch (pin) { + case POWER7_INPUT_INT: +@@ -292,11 +291,6 @@ static void power7_set_irq(void *opaque, int pin, int level) + LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); + return; + } +- if (level) { +- env->irq_input_state |= 1 << pin; +- } else { +- env->irq_input_state &= ~(1 << pin); +- } + } + + void ppcPOWER7_irq_init(PowerPCCPU *cpu) +@@ -311,10 +305,9 @@ void ppcPOWER7_irq_init(PowerPCCPU *cpu) + static void power9_set_irq(void *opaque, int pin, int level) + { + PowerPCCPU *cpu = opaque; +- CPUPPCState *env = &cpu->env; + + LOG_IRQ("%s: env %p pin %d level %d\n", __func__, +- env, pin, level); ++ &cpu->env, pin, level); + + switch (pin) { + case POWER9_INPUT_INT: +@@ -334,11 +327,6 @@ static void power9_set_irq(void *opaque, int pin, int level) + LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); + return; + } +- if (level) { +- env->irq_input_state |= 1 << pin; +- } else { +- env->irq_input_state &= ~(1 << pin); +- } + } + + void ppcPOWER9_irq_init(PowerPCCPU *cpu) +diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h +index 5c53801..8887f76 100644 +--- a/target/ppc/cpu.h ++++ b/target/ppc/cpu.h +@@ -1090,7 +1090,9 @@ struct CPUPPCState { + #if !defined(CONFIG_USER_ONLY) + /* + * This is the IRQ controller, which is implementation dependent +- * and only relevant when emulating a complete machine. ++ * and only relevant when emulating a complete machine. 
Note that ++ * this isn't used by recent Book3s compatible CPUs (POWER7 and ++ * newer). + */ + uint32_t irq_input_state; + void **irq_inputs; +-- +1.8.3.1 + diff --git a/kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch b/kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch new file mode 100755 index 0000000..380007c --- /dev/null +++ b/kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch @@ -0,0 +1,82 @@ +From 5b826e7ed09ecf3b2837d147fec6b593f629e450 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Fri, 4 Dec 2020 15:07:59 -0500 +Subject: [PATCH 01/14] ppc/spapr: Add hotremovable flag on DIMM LMBs on + drmem_v2 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Greg Kurz +Message-id: <20201204150800.264829-2-gkurz@redhat.com> +Patchwork-id: 100217 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/2] ppc/spapr: Add hotremovable flag on DIMM LMBs on drmem_v2 +Bugzilla: 1901837 +RH-Acked-by: Danilo de Paula +RH-Acked-by: David Gibson +RH-Acked-by: Laurent Vivier + +From: Leonardo Bras + +On reboot, all memory that was previously added using object_add and +device_add is placed in this DIMM area. + +The new SPAPR_LMB_FLAGS_HOTREMOVABLE flag helps Linux to put this memory in +the correct memory zone, so no unmovable allocations are made there, +allowing the object to be easily hot-removed by device_del and +object_del. + +This new flag was accepted in Power Architecture documentation. + +Signed-off-by: Leonardo Bras +Reviewed-by: Bharata B Rao +Message-Id: <20200511200201.58537-1-leobras.c@gmail.com> +[dwg: Fixed syntax error spotted by Cédric Le Goater] +Signed-off-by: David Gibson +(cherry picked from commit 0911a60c76b8598f1863c6951b2b690059465153) +Signed-off-by: Greg Kurz + +Conflicts: + hw/ppc/pnv.c + +The changes in this file clearly don't belong to this +patch. Same goes for the changes in target/ppc/cpu.h and +target/ppc/excp_helper.c. 
Something went wrong when the +patch was applied. Anyway, downstream doesn't especially +care for pnv, so just drop the changes. + +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr.c | 3 ++- + include/hw/ppc/spapr.h | 1 + + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index a330f038b95..c74079702d0 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -690,7 +690,8 @@ static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt, + g_assert(drc); + elem = spapr_get_drconf_cell(size / lmb_size, addr, + spapr_drc_index(drc), node, +- SPAPR_LMB_FLAGS_ASSIGNED); ++ (SPAPR_LMB_FLAGS_ASSIGNED | ++ SPAPR_LMB_FLAGS_HOTREMOVABLE)); + QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry); + nr_entries++; + cur_addr = addr + size; +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h +index aa89cc4a95c..e047dabf300 100644 +--- a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -847,6 +847,7 @@ int spapr_rtc_import_offset(SpaprRtcState *rtc, int64_t legacy_offset); + #define SPAPR_LMB_FLAGS_ASSIGNED 0x00000008 + #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020 + #define SPAPR_LMB_FLAGS_RESERVED 0x00000080 ++#define SPAPR_LMB_FLAGS_HOTREMOVABLE 0x00000100 + + void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg); + +-- +2.27.0 + diff --git a/kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch b/kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch new file mode 100755 index 0000000..ee0b19a --- /dev/null +++ b/kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch @@ -0,0 +1,67 @@ +From e4065c7739c8ea3f6f88898295ed899a1059806e Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Fri, 4 Dec 2020 15:08:00 -0500 +Subject: [PATCH 02/14] ppc/spapr: re-assert IRQs during event-scan if there + are pending + +RH-Author: Greg Kurz +Message-id: <20201204150800.264829-3-gkurz@redhat.com> +Patchwork-id: 100216 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/2] ppc/spapr: re-assert 
IRQs during event-scan if there are pending +Bugzilla: 1901837 +RH-Acked-by: Danilo de Paula +RH-Acked-by: David Gibson +RH-Acked-by: Laurent Vivier + +From: Laurent Vivier + +If we hotplug a CPU during the first second of the kernel boot, +the IRQ can be sent to the kernel while the RTAS event handler +is not installed. The event is queued, but the kernel doesn't +collect it and ignores the new CPU. + +As the code relies on edge-triggered IRQ, we can re-assert it +during the event-scan RTAS call if there are still pending +events (as it is already done in check-exception). + +Signed-off-by: Laurent Vivier +Message-Id: <20201015210318.117386-1-lvivier@redhat.com> +Reviewed-by: Greg Kurz +Signed-off-by: David Gibson +(cherry picked from commit dff669d6a15fb92b063cb5aa691b4bb498727404) +Signed-off-by: Greg Kurz +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_events.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c +index e355e000d07..15b92b63adb 100644 +--- a/hw/ppc/spapr_events.c ++++ b/hw/ppc/spapr_events.c +@@ -692,10 +692,22 @@ static void event_scan(PowerPCCPU *cpu, SpaprMachineState *spapr, + target_ulong args, + uint32_t nret, target_ulong rets) + { ++ int i; + if (nargs != 4 || nret != 1) { + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); + return; + } ++ ++ for (i = 0; i < EVENT_CLASS_MAX; i++) { ++ if (rtas_event_log_contains(EVENT_CLASS_MASK(i))) { ++ const SpaprEventSource *source = ++ spapr_event_sources_get_source(spapr->event_sources, i); ++ ++ g_assert(source->enabled); ++ qemu_irq_pulse(spapr_qirq(spapr, source->irq)); ++ } ++ } ++ + rtas_st(rets, 0, RTAS_OUT_NO_ERRORS_FOUND); + } + +-- +2.27.0 + diff --git a/kvm-qapi-Add-allow-write-only-overlay-feature-for-blockd.patch b/kvm-qapi-Add-allow-write-only-overlay-feature-for-blockd.patch new file mode 100755 index 0000000..9c25b76 --- /dev/null +++ b/kvm-qapi-Add-allow-write-only-overlay-feature-for-blockd.patch @@ -0,0 +1,64 @@ +From 
428eb7260718b69b1f3f421d03bce10b8785fc49 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:39 +0000 +Subject: [PATCH 19/20] qapi: Add '@allow-write-only-overlay' feature for + 'blockdev-snapshot' + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-14-kwolf@redhat.com> +Patchwork-id: 94290 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 13/13] qapi: Add '@allow-write-only-overlay' feature for 'blockdev-snapshot' +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +From: Peter Krempa + +Announce that 'blockdev-snapshot' command's permissions allow changing +of the backing file if the 'consistent_read' permission is not required. + +This is useful for libvirt to allow late opening of the backing chain +during a blockdev-mirror. + +Signed-off-by: Peter Krempa +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-8-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit c6bdc312f30d5c7326aa2fdca3e0f98c15eb541a) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + qapi/block-core.json | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/qapi/block-core.json b/qapi/block-core.json +index a1e85b0..a64ad81 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -1541,6 +1541,12 @@ + # + # For the arguments, see the documentation of BlockdevSnapshot. + # ++# Features: ++# @allow-write-only-overlay: If present, the check whether this operation is safe ++# was relaxed so that it can be used to change ++# backing file of a destination of a blockdev-mirror. 
++# (since 5.0) ++# + # Since: 2.5 + # + # Example: +@@ -1561,7 +1567,8 @@ + # + ## + { 'command': 'blockdev-snapshot', +- 'data': 'BlockdevSnapshot' } ++ 'data': 'BlockdevSnapshot', ++ 'features': [ 'allow-write-only-overlay' ] } + + ## + # @change-backing-file: +-- +1.8.3.1 + diff --git a/kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch b/kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch new file mode 100755 index 0000000..bf296d8 --- /dev/null +++ b/kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch @@ -0,0 +1,237 @@ +From 34f664093db2a6275fcddd768684c7319cfc01b4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:06 -0500 +Subject: [PATCH 05/14] qapi: enable use of g_autoptr with QAPI types +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-2-marcandre.lureau@redhat.com> +Patchwork-id: 100472 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 01/10] qapi: enable use of g_autoptr with QAPI types +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Daniel P. Berrangé + +Currently QAPI generates a type and function for free'ing it: + + typedef struct QCryptoBlockCreateOptions QCryptoBlockCreateOptions; + void qapi_free_QCryptoBlockCreateOptions(QCryptoBlockCreateOptions *obj); + +This is used in the traditional manner: + + QCryptoBlockCreateOptions *opts = NULL; + + opts = g_new0(QCryptoBlockCreateOptions, 1); + + ....do stuff with opts... + + qapi_free_QCryptoBlockCreateOptions(opts); + +Since bumping the min glib to 2.48, QEMU has incrementally adopted the +use of g_auto/g_autoptr. This allows the compiler to run a function to +free a variable when it goes out of scope, the benefit being the +compiler can guarantee it is freed in all possible code paths. 
+ +This benefit is applicable to QAPI types too, and given the seriously +long method names for some qapi_free_XXXX() functions, is much less +typing. This change thus makes the code generator emit: + + G_DEFINE_AUTOPTR_CLEANUP_FUNC(QCryptoBlockCreateOptions, + qapi_free_QCryptoBlockCreateOptions) + +The above code example now becomes + + g_autoptr(QCryptoBlockCreateOptions) opts = NULL; + + opts = g_new0(QCryptoBlockCreateOptions, 1); + + ....do stuff with opts... + +Note, if the local pointer needs to live beyond the scope holding the +variable, then g_steal_pointer can be used. This is useful to return the +pointer to the caller in the success codepath, while letting it be freed +in all error codepaths. + + return g_steal_pointer(&opts); + +The crypto/block.h header needs updating to avoid symbol clash now that +the g_autoptr support is a standard QAPI feature. + +Signed-off-by: Daniel P. Berrangé +Message-Id: <20200723153845.2934357-1-berrange@redhat.com> +Reviewed-by: Markus Armbruster +Reviewed-by: Eric Blake +Signed-off-by: Markus Armbruster + +(cherry picked from commit 221db5daf6b3666f1c8e4ca06ae45892e99a112f) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. 
de Paula +--- + docs/devel/qapi-code-gen.txt | 2 ++ + scripts/qapi/types.py | 1 + + tests/test-qobject-input-visitor.c | 23 +++++++---------------- + 3 files changed, 10 insertions(+), 16 deletions(-) + +diff --git a/docs/devel/qapi-code-gen.txt b/docs/devel/qapi-code-gen.txt +index 45c93a43cc3..ca59c695fac 100644 +--- a/docs/devel/qapi-code-gen.txt ++++ b/docs/devel/qapi-code-gen.txt +@@ -1278,6 +1278,7 @@ Example: + }; + + void qapi_free_UserDefOne(UserDefOne *obj); ++ G_DEFINE_AUTOPTR_CLEANUP_FUNC(UserDefOne, qapi_free_UserDefOne) + + struct UserDefOneList { + UserDefOneList *next; +@@ -1285,6 +1286,7 @@ Example: + }; + + void qapi_free_UserDefOneList(UserDefOneList *obj); ++ G_DEFINE_AUTOPTR_CLEANUP_FUNC(UserDefOneList, qapi_free_UserDefOneList) + + struct q_obj_my_command_arg { + UserDefOneList *arg1; +diff --git a/scripts/qapi/types.py b/scripts/qapi/types.py +index d8751daa049..c3be141dc90 100644 +--- a/scripts/qapi/types.py ++++ b/scripts/qapi/types.py +@@ -213,6 +213,7 @@ def gen_type_cleanup_decl(name): + ret = mcgen(''' + + void qapi_free_%(c_name)s(%(c_name)s *obj); ++G_DEFINE_AUTOPTR_CLEANUP_FUNC(%(c_name)s, qapi_free_%(c_name)s) + ''', + c_name=c_name(name)) + return ret +diff --git a/tests/test-qobject-input-visitor.c b/tests/test-qobject-input-visitor.c +index 6bacabf0632..e41b91a2a6f 100644 +--- a/tests/test-qobject-input-visitor.c ++++ b/tests/test-qobject-input-visitor.c +@@ -417,7 +417,7 @@ static void test_visitor_in_struct(TestInputVisitorData *data, + static void test_visitor_in_struct_nested(TestInputVisitorData *data, + const void *unused) + { +- UserDefTwo *udp = NULL; ++ g_autoptr(UserDefTwo) udp = NULL; + Visitor *v; + + v = visitor_input_test_init(data, "{ 'string0': 'string0', " +@@ -433,8 +433,6 @@ static void test_visitor_in_struct_nested(TestInputVisitorData *data, + g_assert_cmpstr(udp->dict1->dict2->userdef->string, ==, "string"); + g_assert_cmpstr(udp->dict1->dict2->string, ==, "string2"); + g_assert(udp->dict1->has_dict3 == 
false); +- +- qapi_free_UserDefTwo(udp); + } + + static void test_visitor_in_list(TestInputVisitorData *data, +@@ -546,7 +544,7 @@ static void test_visitor_in_union_flat(TestInputVisitorData *data, + const void *unused) + { + Visitor *v; +- UserDefFlatUnion *tmp; ++ g_autoptr(UserDefFlatUnion) tmp = NULL; + UserDefUnionBase *base; + + v = visitor_input_test_init(data, +@@ -563,8 +561,6 @@ static void test_visitor_in_union_flat(TestInputVisitorData *data, + + base = qapi_UserDefFlatUnion_base(tmp); + g_assert(&base->enum1 == &tmp->enum1); +- +- qapi_free_UserDefFlatUnion(tmp); + } + + static void test_visitor_in_alternate(TestInputVisitorData *data, +@@ -690,7 +686,7 @@ static void test_list_union_integer_helper(TestInputVisitorData *data, + const void *unused, + UserDefListUnionKind kind) + { +- UserDefListUnion *cvalue = NULL; ++ g_autoptr(UserDefListUnion) cvalue = NULL; + Visitor *v; + GString *gstr_list = g_string_new(""); + GString *gstr_union = g_string_new(""); +@@ -782,7 +778,6 @@ static void test_list_union_integer_helper(TestInputVisitorData *data, + + g_string_free(gstr_union, true); + g_string_free(gstr_list, true); +- qapi_free_UserDefListUnion(cvalue); + } + + static void test_visitor_in_list_union_int(TestInputVisitorData *data, +@@ -851,7 +846,7 @@ static void test_visitor_in_list_union_uint64(TestInputVisitorData *data, + static void test_visitor_in_list_union_bool(TestInputVisitorData *data, + const void *unused) + { +- UserDefListUnion *cvalue = NULL; ++ g_autoptr(UserDefListUnion) cvalue = NULL; + boolList *elem = NULL; + Visitor *v; + GString *gstr_list = g_string_new(""); +@@ -879,13 +874,12 @@ static void test_visitor_in_list_union_bool(TestInputVisitorData *data, + + g_string_free(gstr_union, true); + g_string_free(gstr_list, true); +- qapi_free_UserDefListUnion(cvalue); + } + + static void test_visitor_in_list_union_string(TestInputVisitorData *data, + const void *unused) + { +- UserDefListUnion *cvalue = NULL; ++ 
g_autoptr(UserDefListUnion) cvalue = NULL; + strList *elem = NULL; + Visitor *v; + GString *gstr_list = g_string_new(""); +@@ -914,7 +908,6 @@ static void test_visitor_in_list_union_string(TestInputVisitorData *data, + + g_string_free(gstr_union, true); + g_string_free(gstr_list, true); +- qapi_free_UserDefListUnion(cvalue); + } + + #define DOUBLE_STR_MAX 16 +@@ -922,7 +915,7 @@ static void test_visitor_in_list_union_string(TestInputVisitorData *data, + static void test_visitor_in_list_union_number(TestInputVisitorData *data, + const void *unused) + { +- UserDefListUnion *cvalue = NULL; ++ g_autoptr(UserDefListUnion) cvalue = NULL; + numberList *elem = NULL; + Visitor *v; + GString *gstr_list = g_string_new(""); +@@ -957,7 +950,6 @@ static void test_visitor_in_list_union_number(TestInputVisitorData *data, + + g_string_free(gstr_union, true); + g_string_free(gstr_list, true); +- qapi_free_UserDefListUnion(cvalue); + } + + static void input_visitor_test_add(const char *testpath, +@@ -1253,7 +1245,7 @@ static void test_visitor_in_fail_alternate(TestInputVisitorData *data, + static void do_test_visitor_in_qmp_introspect(TestInputVisitorData *data, + const QLitObject *qlit) + { +- SchemaInfoList *schema = NULL; ++ g_autoptr(SchemaInfoList) schema = NULL; + QObject *obj = qobject_from_qlit(qlit); + Visitor *v; + +@@ -1262,7 +1254,6 @@ static void do_test_visitor_in_qmp_introspect(TestInputVisitorData *data, + visit_type_SchemaInfoList(v, NULL, &schema, &error_abort); + g_assert(schema); + +- qapi_free_SchemaInfoList(schema); + qobject_unref(obj); + visit_free(v); + } +-- +2.27.0 + diff --git a/kvm-qcow2-Expose-bitmaps-size-during-measure.patch b/kvm-qcow2-Expose-bitmaps-size-during-measure.patch new file mode 100755 index 0000000..48c15c5 --- /dev/null +++ b/kvm-qcow2-Expose-bitmaps-size-during-measure.patch @@ -0,0 +1,495 @@ +From af4d66e07c86d7593f7d18ae4b6a2151123b529b Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:17 +0100 +Subject: [PATCH 
12/26] qcow2: Expose bitmaps' size during measure + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-10-eblake@redhat.com> +Patchwork-id: 97072 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 09/12] qcow2: Expose bitmaps' size during measure +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +It's useful to know how much space can be occupied by qcow2 persistent +bitmaps, even though such metadata is unrelated to the guest-visible +data. Report this value as an additional QMP field, present when +measuring an existing image and output format that both support +bitmaps. Update iotest 178 and 190 to updated output, as well as new +coverage in 190 demonstrating non-zero values made possible with the +recently-added qemu-img bitmap command (see 3b51ab4b). + +The new 'bitmaps size:' field is displayed automatically as part of +'qemu-img measure' any time it is present in QMP (that is, any time +both the source image being measured and destination format support +bitmaps, even if the measurement is 0 because there are no bitmaps +present). If the field is absent, it means that no bitmaps can be +copied (source, destination, or both lack bitmaps, including when +measuring based on size rather than on a source image). This behavior +is compatible with an upcoming patch adding 'qemu-img convert +--bitmaps': that command will fail in the same situations where this +patch omits the field. + +The addition of a new field demonstrates why we should always +zero-initialize qapi C structs; while the qcow2 driver still fully +populates all fields, the raw and crypto drivers had to be tweaked to +avoid uninitialized data. + +Consideration was also given towards having a 'qemu-img measure +--bitmaps' which errors out when bitmaps are not possible, and +otherwise sums the bitmaps into the existing allocation totals rather +than displaying as a separate field, as a potential convenience +factor. 
But this was ultimately decided to be more complexity than +necessary when the QMP interface was sufficient enough with bitmaps +remaining a separate field. + +See also: https://bugzilla.redhat.com/1779904 + +Reported-by: Nir Soffer +Signed-off-by: Eric Blake +Message-Id: <20200521192137.1120211-3-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 5d72c68b49769c927e90b78af6d90f6a384b26ac) + +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + block/crypto.c - commit a9da6e49 not present (no measure support) + docs/tools/qemu-img.rst - changes in qemu-img.texi instead +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2-bitmap.c | 36 ++++++++++++++++++++++++++++++ + block/qcow2.c | 14 +++++++++--- + block/qcow2.h | 2 ++ + block/raw-format.c | 2 +- + qapi/block-core.json | 16 +++++++++----- + qemu-img.c | 3 +++ + qemu-img.texi | 7 ++++++ + tests/qemu-iotests/178.out.qcow2 | 16 ++++++++++++++ + tests/qemu-iotests/190 | 47 ++++++++++++++++++++++++++++++++++++++-- + tests/qemu-iotests/190.out | 27 ++++++++++++++++++++++- + 10 files changed, 158 insertions(+), 12 deletions(-) + +diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c +index cbac905..10d1297 100644 +--- a/block/qcow2-bitmap.c ++++ b/block/qcow2-bitmap.c +@@ -1766,3 +1766,39 @@ bool qcow2_supports_persistent_dirty_bitmap(BlockDriverState *bs) + + return s->qcow_version >= 3; + } ++ ++/* ++ * Compute the space required for bitmaps in @bs. ++ * ++ * The computation is based as if copying to a new image with the ++ * given @cluster_size, which may differ from the cluster size in @bs. 
++ */ ++uint64_t qcow2_get_persistent_dirty_bitmap_size(BlockDriverState *bs, ++ uint32_t cluster_size) ++{ ++ uint64_t bitmaps_size = 0; ++ BdrvDirtyBitmap *bm; ++ size_t bitmap_dir_size = 0; ++ ++ FOR_EACH_DIRTY_BITMAP(bs, bm) { ++ if (bdrv_dirty_bitmap_get_persistence(bm)) { ++ const char *name = bdrv_dirty_bitmap_name(bm); ++ uint32_t granularity = bdrv_dirty_bitmap_granularity(bm); ++ uint64_t bmbytes = ++ get_bitmap_bytes_needed(bdrv_dirty_bitmap_size(bm), ++ granularity); ++ uint64_t bmclusters = DIV_ROUND_UP(bmbytes, cluster_size); ++ ++ /* Assume the entire bitmap is allocated */ ++ bitmaps_size += bmclusters * cluster_size; ++ /* Also reserve space for the bitmap table entries */ ++ bitmaps_size += ROUND_UP(bmclusters * sizeof(uint64_t), ++ cluster_size); ++ /* And space for contribution to bitmap directory size */ ++ bitmap_dir_size += calc_dir_entry_size(strlen(name), 0); ++ } ++ } ++ bitmaps_size += ROUND_UP(bitmap_dir_size, cluster_size); ++ ++ return bitmaps_size; ++} +diff --git a/block/qcow2.c b/block/qcow2.c +index 36b0f7d..dbd870a 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -4751,16 +4751,24 @@ static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs, + required = virtual_size; + } + +- info = g_new(BlockMeasureInfo, 1); ++ info = g_new0(BlockMeasureInfo, 1); + info->fully_allocated = + qcow2_calc_prealloc_size(virtual_size, cluster_size, + ctz32(refcount_bits)) + luks_payload_size; + +- /* Remove data clusters that are not required. This overestimates the ++ /* ++ * Remove data clusters that are not required. This overestimates the + * required size because metadata needed for the fully allocated file is +- * still counted. ++ * still counted. Show bitmaps only if both source and destination ++ * would support them. 
+ */ + info->required = info->fully_allocated - virtual_size + required; ++ info->has_bitmaps = version >= 3 && in_bs && ++ bdrv_supports_persistent_dirty_bitmap(in_bs); ++ if (info->has_bitmaps) { ++ info->bitmaps = qcow2_get_persistent_dirty_bitmap_size(in_bs, ++ cluster_size); ++ } + return info; + + err: +diff --git a/block/qcow2.h b/block/qcow2.h +index ceb1ceb..3297e6b 100644 +--- a/block/qcow2.h ++++ b/block/qcow2.h +@@ -768,6 +768,8 @@ int qcow2_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, + const char *name, + Error **errp); + bool qcow2_supports_persistent_dirty_bitmap(BlockDriverState *bs); ++uint64_t qcow2_get_persistent_dirty_bitmap_size(BlockDriverState *bs, ++ uint32_t cluster_size); + + ssize_t coroutine_fn + qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, +diff --git a/block/raw-format.c b/block/raw-format.c +index 93b25e1..4bb54f4 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -346,7 +346,7 @@ static BlockMeasureInfo *raw_measure(QemuOpts *opts, BlockDriverState *in_bs, + BDRV_SECTOR_SIZE); + } + +- info = g_new(BlockMeasureInfo, 1); ++ info = g_new0(BlockMeasureInfo, 1); + info->required = required; + + /* Unallocated sectors count towards the file size in raw images */ +diff --git a/qapi/block-core.json b/qapi/block-core.json +index a64ad81..2893209 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -689,18 +689,24 @@ + # efficiently so file size may be smaller than virtual disk size. + # + # The values are upper bounds that are guaranteed to fit the new image file. +-# Subsequent modification, such as internal snapshot or bitmap creation, may +-# require additional space and is not covered here. ++# Subsequent modification, such as internal snapshot or further bitmap ++# creation, may require additional space and is not covered here. + # +-# @required: Size required for a new image file, in bytes. 
++# @required: Size required for a new image file, in bytes, when copying just ++# allocated guest-visible contents. + # + # @fully-allocated: Image file size, in bytes, once data has been written +-# to all sectors. ++# to all sectors, when copying just guest-visible contents. ++# ++# @bitmaps: Additional size required if all the top-level bitmap metadata ++# in the source image were to be copied to the destination, ++# present only when source and destination both support ++# persistent bitmaps. (since 5.1) + # + # Since: 2.10 + ## + { 'struct': 'BlockMeasureInfo', +- 'data': {'required': 'int', 'fully-allocated': 'int'} } ++ 'data': {'required': 'int', 'fully-allocated': 'int', '*bitmaps': 'int'} } + + ## + # @query-block: +diff --git a/qemu-img.c b/qemu-img.c +index 11a4537..b57856e 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -5212,6 +5212,9 @@ static int img_measure(int argc, char **argv) + if (output_format == OFORMAT_HUMAN) { + printf("required size: %" PRIu64 "\n", info->required); + printf("fully allocated size: %" PRIu64 "\n", info->fully_allocated); ++ if (info->has_bitmaps) { ++ printf("bitmaps size: %" PRIu64 "\n", info->bitmaps); ++ } + } else { + dump_json_block_measure_info(info); + } +diff --git a/qemu-img.texi b/qemu-img.texi +index abf2771..3670b96 100644 +--- a/qemu-img.texi ++++ b/qemu-img.texi +@@ -576,6 +576,7 @@ The following fields are reported: + @example + required size: 524288 + fully allocated size: 1074069504 ++bitmaps size: 0 + @end example + + The @code{required size} is the file size of the new image. It may be smaller +@@ -586,6 +587,12 @@ been written to all sectors. This is the maximum size that the image file can + occupy with the exception of internal snapshots, dirty bitmaps, vmstate data, + and other advanced image format features. 
+ ++The @code{bitmaps size} is the additional size required in order to ++copy bitmaps from a source image in addition to the guest-visible ++data; the line is omitted if either source or destination lacks ++bitmap support, or 0 if bitmaps are supported but there is nothing to ++copy. ++ + @item snapshot [--object @var{objectdef}] [--image-opts] [-U] [-q] [-l | -a @var{snapshot} | -c @var{snapshot} | -d @var{snapshot}] @var{filename} + + List, apply, create or delete snapshots in image @var{filename}. +diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2 +index 345eab3..b9ed41b 100644 +--- a/tests/qemu-iotests/178.out.qcow2 ++++ b/tests/qemu-iotests/178.out.qcow2 +@@ -37,6 +37,7 @@ qemu-img: The image size is too large (try using a larger cluster size) + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=0 + required size: 196608 + fully allocated size: 196608 ++bitmaps size: 0 + + converted image file size in bytes: 196608 + +@@ -45,6 +46,7 @@ converted image file size in bytes: 196608 + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 + required size: 393216 + fully allocated size: 1074135040 ++bitmaps size: 0 + wrote 512/512 bytes at offset 512 + 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + wrote 65536/65536 bytes at offset 65536 +@@ -53,6 +55,7 @@ wrote 64512/64512 bytes at offset 134217728 + 63 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + required size: 589824 + fully allocated size: 1074135040 ++bitmaps size: 0 + + converted image file size in bytes: 524288 + +@@ -60,6 +63,7 @@ converted image file size in bytes: 524288 + + required size: 524288 + fully allocated size: 1074135040 ++bitmaps size: 0 + + converted image file size in bytes: 458752 + +@@ -67,16 +71,19 @@ converted image file size in bytes: 458752 + + required size: 1074135040 + fully allocated size: 1074135040 ++bitmaps size: 0 + + == qcow2 input image and LUKS encryption == + + required size: 2686976 + fully allocated size: 1076232192 
++bitmaps size: 0 + + == qcow2 input image and preallocation (human) == + + required size: 1074135040 + fully allocated size: 1074135040 ++bitmaps size: 0 + + converted image file size in bytes: 1074135040 + +@@ -87,6 +94,7 @@ wrote 8388608/8388608 bytes at offset 0 + 8 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + required size: 8716288 + fully allocated size: 8716288 ++bitmaps size: 0 + + converted image file size in bytes: 8716288 + +@@ -173,6 +181,7 @@ qemu-img: The image size is too large (try using a larger cluster size) + + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=0 + { ++ "bitmaps": 0, + "required": 196608, + "fully-allocated": 196608 + } +@@ -183,6 +192,7 @@ converted image file size in bytes: 196608 + + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 + { ++ "bitmaps": 0, + "required": 393216, + "fully-allocated": 1074135040 + } +@@ -193,6 +203,7 @@ wrote 65536/65536 bytes at offset 65536 + wrote 64512/64512 bytes at offset 134217728 + 63 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { ++ "bitmaps": 0, + "required": 589824, + "fully-allocated": 1074135040 + } +@@ -202,6 +213,7 @@ converted image file size in bytes: 524288 + == qcow2 input image with internal snapshot (json) == + + { ++ "bitmaps": 0, + "required": 524288, + "fully-allocated": 1074135040 + } +@@ -211,6 +223,7 @@ converted image file size in bytes: 458752 + == qcow2 input image and a backing file (json) == + + { ++ "bitmaps": 0, + "required": 1074135040, + "fully-allocated": 1074135040 + } +@@ -218,6 +231,7 @@ converted image file size in bytes: 458752 + == qcow2 input image and LUKS encryption == + + { ++ "bitmaps": 0, + "required": 2686976, + "fully-allocated": 1076232192 + } +@@ -225,6 +239,7 @@ converted image file size in bytes: 458752 + == qcow2 input image and preallocation (json) == + + { ++ "bitmaps": 0, + "required": 1074135040, + "fully-allocated": 1074135040 + } +@@ -237,6 +252,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608 + wrote 
8388608/8388608 bytes at offset 0 + 8 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { ++ "bitmaps": 0, + "required": 8716288, + "fully-allocated": 8716288 + } +diff --git a/tests/qemu-iotests/190 b/tests/qemu-iotests/190 +index eb766ad..5084ccd 100755 +--- a/tests/qemu-iotests/190 ++++ b/tests/qemu-iotests/190 +@@ -2,7 +2,7 @@ + # + # qemu-img measure sub-command tests on huge qcow2 files + # +-# Copyright (C) 2017 Red Hat, Inc. ++# Copyright (C) 2017-2020 Red Hat, Inc. + # + # This program is free software; you can redistribute it and/or modify + # it under the terms of the GNU General Public License as published by +@@ -42,7 +42,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 + _supported_fmt qcow2 + _supported_proto file + +-echo "== Huge file ==" ++echo "== Huge file without bitmaps ==" + echo + + IMGOPTS='cluster_size=2M' _make_test_img 2T +@@ -51,6 +51,49 @@ $QEMU_IMG measure -O raw -f qcow2 "$TEST_IMG" + $QEMU_IMG measure -O qcow2 -o cluster_size=64k -f qcow2 "$TEST_IMG" + $QEMU_IMG measure -O qcow2 -o cluster_size=2M -f qcow2 "$TEST_IMG" + ++echo ++echo "== Huge file with bitmaps ==" ++echo ++ ++$QEMU_IMG bitmap --add --granularity 512 -f qcow2 "$TEST_IMG" b1 ++$QEMU_IMG bitmap --add -g 2M -f qcow2 "$TEST_IMG" b2 ++ ++# No bitmap without a source ++$QEMU_IMG measure -O qcow2 --size 10M ++# No bitmap output, since raw does not support it ++$QEMU_IMG measure -O raw -f qcow2 "$TEST_IMG" ++# No bitmap output, since no bitmaps on raw source. 
Munge required size, as ++# some filesystems store the qcow2 file with less sparseness than others ++$QEMU_IMG measure -O qcow2 -f raw "$TEST_IMG" | ++ sed '/^required size:/ s/[0-9][0-9]*/SIZE/' ++# No bitmap output, since v2 does not support it ++$QEMU_IMG measure -O qcow2 -o compat=0.10 -f qcow2 "$TEST_IMG" ++ ++# Compute expected output: bitmap clusters + bitmap tables + bitmaps directory ++echo ++val2T=$((2*1024*1024*1024*1024)) ++cluster=$((64*1024)) ++b1clusters=$(( (val2T/512/8 + cluster - 1) / cluster )) ++b2clusters=$(( (val2T/2/1024/1024/8 + cluster - 1) / cluster )) ++echo expected bitmap $((b1clusters * cluster + ++ (b1clusters * 8 + cluster - 1) / cluster * cluster + ++ b2clusters * cluster + ++ (b2clusters * 8 + cluster - 1) / cluster * cluster + ++ cluster)) ++$QEMU_IMG measure -O qcow2 -o cluster_size=64k -f qcow2 "$TEST_IMG" ++ ++# Compute expected output: bitmap clusters + bitmap tables + bitmaps directory ++echo ++cluster=$((2*1024*1024)) ++b1clusters=$(( (val2T/512/8 + cluster - 1) / cluster )) ++b2clusters=$(( (val2T/2/1024/1024/8 + cluster - 1) / cluster )) ++echo expected bitmap $((b1clusters * cluster + ++ (b1clusters * 8 + cluster - 1) / cluster * cluster + ++ b2clusters * cluster + ++ (b2clusters * 8 + cluster - 1) / cluster * cluster + ++ cluster)) ++$QEMU_IMG measure --output=json -O qcow2 -o cluster_size=2M -f qcow2 "$TEST_IMG" ++ + # success, all done + echo "*** done" + rm -f $seq.full +diff --git a/tests/qemu-iotests/190.out b/tests/qemu-iotests/190.out +index d001942..ed9d821 100644 +--- a/tests/qemu-iotests/190.out ++++ b/tests/qemu-iotests/190.out +@@ -1,11 +1,36 @@ + QA output created by 190 +-== Huge file == ++== Huge file without bitmaps == + + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=2199023255552 + required size: 2199023255552 + fully allocated size: 2199023255552 + required size: 335806464 + fully allocated size: 2199359062016 ++bitmaps size: 0 + required size: 18874368 + fully allocated size: 2199042129920 ++bitmaps 
size: 0 ++ ++== Huge file with bitmaps == ++ ++required size: 327680 ++fully allocated size: 10813440 ++required size: 2199023255552 ++fully allocated size: 2199023255552 ++required size: SIZE ++fully allocated size: 17170432 ++required size: 335806464 ++fully allocated size: 2199359062016 ++ ++expected bitmap 537198592 ++required size: 335806464 ++fully allocated size: 2199359062016 ++bitmaps size: 537198592 ++ ++expected bitmap 545259520 ++{ ++ "bitmaps": 545259520, ++ "required": 18874368, ++ "fully-allocated": 2199042129920 ++} + *** done +-- +1.8.3.1 + diff --git a/kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch b/kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch new file mode 100755 index 0000000..43ff282 --- /dev/null +++ b/kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch @@ -0,0 +1,47 @@ +From bd97bbbce54da301407d51cae35e09ba2a12b160 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Mon, 13 Jul 2020 14:24:48 -0400 +Subject: [PATCH 1/4] qcow2: Fix alloc_cluster_abort() for pre-existing + clusters + +RH-Author: Max Reitz +Message-id: <20200713142451.289703-2-mreitz@redhat.com> +Patchwork-id: 97954 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/4] qcow2: Fix alloc_cluster_abort() for pre-existing clusters +Bugzilla: 1807057 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +handle_alloc() reuses preallocated zero clusters. If anything goes +wrong during the data write, we do not change their L2 entry, so we +must not let qcow2_alloc_cluster_abort() free them. + +Fixes: 8b24cd141549b5b264baeddd4e72902cfb5de23b +Cc: qemu-stable@nongnu.org +Signed-off-by: Max Reitz +Message-Id: <20200225143130.111267-2-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 3ede935fdbbd5f7b24b4724bbfb8938acb5956d8) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. 
de Paula +--- + block/qcow2-cluster.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c +index 9d04f8d77b..1970797ce5 100644 +--- a/block/qcow2-cluster.c ++++ b/block/qcow2-cluster.c +@@ -1015,7 +1015,7 @@ err: + void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) + { + BDRVQcow2State *s = bs->opaque; +- if (!has_data_file(bs)) { ++ if (!has_data_file(bs) && !m->keep_old_clusters) { + qcow2_free_clusters(bs, m->alloc_offset, + m->nb_clusters << s->cluster_bits, + QCOW2_DISCARD_NEVER); +-- +2.27.0 + diff --git a/kvm-qcow2-Fix-qcow2_alloc_cluster_abort-for-external-dat.patch b/kvm-qcow2-Fix-qcow2_alloc_cluster_abort-for-external-dat.patch new file mode 100755 index 0000000..1a7ace5 --- /dev/null +++ b/kvm-qcow2-Fix-qcow2_alloc_cluster_abort-for-external-dat.patch @@ -0,0 +1,52 @@ +From ecc4fb6e1941035e1d9def1f69b779fbea216caf Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 24 Feb 2020 16:13:07 +0000 +Subject: [PATCH 7/9] qcow2: Fix qcow2_alloc_cluster_abort() for external data + file + +RH-Author: Kevin Wolf +Message-id: <20200224161307.29783-2-kwolf@redhat.com> +Patchwork-id: 94042 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] qcow2: Fix qcow2_alloc_cluster_abort() for external data file +Bugzilla: 1703907 +RH-Acked-by: John Snow +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +For external data file, cluster allocations return an offset in the data +file and are not refcounted. In this case, there is nothing to do for +qcow2_alloc_cluster_abort(). Freeing the same offset in the qcow2 file +is wrong and causes crashes in the better case or image corruption in +the worse case. + +Signed-off-by: Kevin Wolf +Message-Id: <20200211094900.17315-3-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit c3b6658c1a5a3fb24d6c27b2594cf86146f75b22) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/qcow2-cluster.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c +index 8982b7b..dc3c270 100644 +--- a/block/qcow2-cluster.c ++++ b/block/qcow2-cluster.c +@@ -1015,8 +1015,11 @@ err: + void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) + { + BDRVQcow2State *s = bs->opaque; +- qcow2_free_clusters(bs, m->alloc_offset, m->nb_clusters << s->cluster_bits, +- QCOW2_DISCARD_NEVER); ++ if (!has_data_file(bs)) { ++ qcow2_free_clusters(bs, m->alloc_offset, ++ m->nb_clusters << s->cluster_bits, ++ QCOW2_DISCARD_NEVER); ++ } + } + + /* +-- +1.8.3.1 + diff --git a/kvm-qcow2-Forward-ZERO_WRITE-flag-for-full-preallocation.patch b/kvm-qcow2-Forward-ZERO_WRITE-flag-for-full-preallocation.patch new file mode 100755 index 0000000..522ba60 --- /dev/null +++ b/kvm-qcow2-Forward-ZERO_WRITE-flag-for-full-preallocation.patch @@ -0,0 +1,98 @@ +From 4290173219e15065e9a7c2e95774ac979b5fd869 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:40 +0100 +Subject: [PATCH 12/17] qcow2: Forward ZERO_WRITE flag for full preallocation + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-12-kwolf@redhat.com> +Patchwork-id: 97456 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 11/11] qcow2: Forward ZERO_WRITE flag for full preallocation +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +The BDRV_REQ_ZERO_WRITE is currently implemented in a way that first the +image is possibly preallocated and then the zero flag is added to all +clusters. This means that a copy-on-write operation may be needed when +writing to these clusters, despite having used preallocation, negating +one of the major benefits of preallocation. + +Instead, try to forward the BDRV_REQ_ZERO_WRITE to the protocol driver, +and if the protocol driver can ensure that the new area reads as zeros, +we can skip setting the zero flag in the qcow2 layer. 
+ +Unfortunately, the same approach doesn't work for metadata +preallocation, so we'll still set the zero flag there. + +Signed-off-by: Kevin Wolf +Reviewed-by: Max Reitz +Message-Id: <20200424142701.67053-1-kwolf@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit eb8a0cf3ba26611f3981f8f45ac6a868975a68cc) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2.c | 22 +++++++++++++++++++--- + tests/qemu-iotests/274.out | 4 ++-- + 2 files changed, 21 insertions(+), 5 deletions(-) + +diff --git a/block/qcow2.c b/block/qcow2.c +index f3d6cb0..b783662 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -4153,9 +4153,25 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + /* Allocate the data area */ + new_file_size = allocation_start + + nb_new_data_clusters * s->cluster_size; +- /* Image file grows, so @exact does not matter */ +- ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0, +- errp); ++ /* ++ * Image file grows, so @exact does not matter. ++ * ++ * If we need to zero out the new area, try first whether the protocol ++ * driver can already take care of this. 
++ */ ++ if (flags & BDRV_REQ_ZERO_WRITE) { ++ ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, ++ BDRV_REQ_ZERO_WRITE, NULL); ++ if (ret >= 0) { ++ flags &= ~BDRV_REQ_ZERO_WRITE; ++ } ++ } else { ++ ret = -1; ++ } ++ if (ret < 0) { ++ ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0, ++ errp); ++ } + if (ret < 0) { + error_prepend(errp, "Failed to resize underlying file: "); + qcow2_free_clusters(bs, allocation_start, +diff --git a/tests/qemu-iotests/274.out b/tests/qemu-iotests/274.out +index 1a796fd..9d6fdeb 100644 +--- a/tests/qemu-iotests/274.out ++++ b/tests/qemu-iotests/274.out +@@ -187,7 +187,7 @@ read 65536/65536 bytes at offset 9437184 + 10 MiB (0xa00000) bytes allocated at offset 5 MiB (0x500000) + + [{ "start": 0, "length": 5242880, "depth": 1, "zero": true, "data": false}, +-{ "start": 5242880, "length": 10485760, "depth": 0, "zero": true, "data": false, "offset": 327680}] ++{ "start": 5242880, "length": 10485760, "depth": 0, "zero": false, "data": true, "offset": 327680}] + + === preallocation=full === + Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=16777216 cluster_size=65536 lazy_refcounts=off refcount_bits=16 +@@ -206,7 +206,7 @@ read 65536/65536 bytes at offset 11534336 + 4 MiB (0x400000) bytes allocated at offset 8 MiB (0x800000) + + [{ "start": 0, "length": 8388608, "depth": 1, "zero": true, "data": false}, +-{ "start": 8388608, "length": 4194304, "depth": 0, "zero": true, "data": false, "offset": 327680}] ++{ "start": 8388608, "length": 4194304, "depth": 0, "zero": false, "data": true, "offset": 327680}] + + === preallocation=off === + Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=393216 cluster_size=65536 lazy_refcounts=off refcount_bits=16 +-- +1.8.3.1 + diff --git a/kvm-qcow2-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch b/kvm-qcow2-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch new file mode 100755 index 0000000..454759e --- /dev/null +++ b/kvm-qcow2-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch @@ -0,0 
+1,101 @@ +From 3e603e344b81b3ecfea6fb9589ba91f70a22139d Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:33 +0100 +Subject: [PATCH 05/17] qcow2: Support BDRV_REQ_ZERO_WRITE for truncate + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-5-kwolf@redhat.com> +Patchwork-id: 97449 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 04/11] qcow2: Support BDRV_REQ_ZERO_WRITE for truncate +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +If BDRV_REQ_ZERO_WRITE is set and we're extending the image, calling +qcow2_cluster_zeroize() with flags=0 does the right thing: It doesn't +undo any previous preallocation, but just adds the zero flag to all +relevant L2 entries. If an external data file is in use, a write_zeroes +request to the data file is made instead. + +Signed-off-by: Kevin Wolf +Message-Id: <20200424125448.63318-5-kwolf@redhat.com> +Reviewed-by: Eric Blake +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit f01643fb8b47e8a70c04bbf45e0f12a9e5bc54de) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/qcow2-cluster.c | 2 +- + block/qcow2.c | 34 ++++++++++++++++++++++++++++++++++ + 2 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c +index dc3c270..9d04f8d 100644 +--- a/block/qcow2-cluster.c ++++ b/block/qcow2-cluster.c +@@ -1784,7 +1784,7 @@ int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset, + /* Caller must pass aligned values, except at image end */ + assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); + assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || +- end_offset == bs->total_sectors << BDRV_SECTOR_BITS); ++ end_offset >= bs->total_sectors << BDRV_SECTOR_BITS); + + /* The zero flag is only supported by version 3 and newer */ + if (s->qcow_version < 3) { +diff --git a/block/qcow2.c b/block/qcow2.c +index 86aa74a..f3d6cb0 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -1726,6 +1726,7 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options, + } + + bs->supported_zero_flags = header.version >= 3 ? BDRV_REQ_MAY_UNMAP : 0; ++ bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; + + /* Repair image if dirty */ + if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only && +@@ -4197,6 +4198,39 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + g_assert_not_reached(); + } + ++ if ((flags & BDRV_REQ_ZERO_WRITE) && offset > old_length) { ++ uint64_t zero_start = QEMU_ALIGN_UP(old_length, s->cluster_size); ++ ++ /* ++ * Use zero clusters as much as we can. qcow2_cluster_zeroize() ++ * requires a cluster-aligned start. The end may be unaligned if it is ++ * at the end of the image (which it is here). 
++ */ ++ ret = qcow2_cluster_zeroize(bs, zero_start, offset - zero_start, 0); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "Failed to zero out new clusters"); ++ goto fail; ++ } ++ ++ /* Write explicit zeros for the unaligned head */ ++ if (zero_start > old_length) { ++ uint64_t len = zero_start - old_length; ++ uint8_t *buf = qemu_blockalign0(bs, len); ++ QEMUIOVector qiov; ++ qemu_iovec_init_buf(&qiov, buf, len); ++ ++ qemu_co_mutex_unlock(&s->lock); ++ ret = qcow2_co_pwritev_part(bs, old_length, len, &qiov, 0, 0); ++ qemu_co_mutex_lock(&s->lock); ++ ++ qemu_vfree(buf); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "Failed to zero out the new area"); ++ goto fail; ++ } ++ } ++ } ++ + if (prealloc != PREALLOC_MODE_OFF) { + /* Flush metadata before actually changing the image size */ + ret = qcow2_write_caches(bs); +-- +1.8.3.1 + diff --git a/kvm-qemu-file-Don-t-do-IO-after-shutdown.patch b/kvm-qemu-file-Don-t-do-IO-after-shutdown.patch new file mode 100755 index 0000000..88a6e31 --- /dev/null +++ b/kvm-qemu-file-Don-t-do-IO-after-shutdown.patch @@ -0,0 +1,92 @@ +From d84814e298e3b05fb5bc61cc8e641a5e104d32d5 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:39 +0000 +Subject: [PATCH 07/18] qemu-file: Don't do IO after shutdown + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-7-quintela@redhat.com> +Patchwork-id: 94116 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 06/10] qemu-file: Don't do IO after shutdown +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +Be sure that we are not doing neither read/write after shutdown of the +QEMUFile. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit a555b8092abc6f1bbe4b64c516679cbd68fcfbd8) +Signed-off-by: Danilo C. L. 
de Paula +--- + migration/qemu-file.c | 22 +++++++++++++++++++++- + 1 file changed, 21 insertions(+), 1 deletion(-) + +diff --git a/migration/qemu-file.c b/migration/qemu-file.c +index 26fb25d..bbb2b63 100644 +--- a/migration/qemu-file.c ++++ b/migration/qemu-file.c +@@ -53,6 +53,8 @@ struct QEMUFile { + + int last_error; + Error *last_error_obj; ++ /* has the file has been shutdown */ ++ bool shutdown; + }; + + /* +@@ -61,10 +63,18 @@ struct QEMUFile { + */ + int qemu_file_shutdown(QEMUFile *f) + { ++ int ret; ++ ++ f->shutdown = true; + if (!f->ops->shut_down) { + return -ENOSYS; + } +- return f->ops->shut_down(f->opaque, true, true, NULL); ++ ret = f->ops->shut_down(f->opaque, true, true, NULL); ++ ++ if (!f->last_error) { ++ qemu_file_set_error(f, -EIO); ++ } ++ return ret; + } + + /* +@@ -214,6 +224,9 @@ void qemu_fflush(QEMUFile *f) + return; + } + ++ if (f->shutdown) { ++ return; ++ } + if (f->iovcnt > 0) { + expect = iov_size(f->iov, f->iovcnt); + ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos, +@@ -328,6 +341,10 @@ static ssize_t qemu_fill_buffer(QEMUFile *f) + f->buf_index = 0; + f->buf_size = pending; + ++ if (f->shutdown) { ++ return 0; ++ } ++ + len = f->ops->get_buffer(f->opaque, f->buf + pending, f->pos, + IO_BUF_SIZE - pending, &local_error); + if (len > 0) { +@@ -642,6 +659,9 @@ int64_t qemu_ftell(QEMUFile *f) + + int qemu_file_rate_limit(QEMUFile *f) + { ++ if (f->shutdown) { ++ return 1; ++ } + if (qemu_file_get_error(f)) { + return 1; + } +-- +1.8.3.1 + diff --git a/kvm-qemu-img-Add-bitmap-sub-command.patch b/kvm-qemu-img-Add-bitmap-sub-command.patch new file mode 100755 index 0000000..eb80188 --- /dev/null +++ b/kvm-qemu-img-Add-bitmap-sub-command.patch @@ -0,0 +1,398 @@ +From 53baacb72e8561391841363b2acbd85a783cbc66 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:15 +0100 +Subject: [PATCH 10/26] qemu-img: Add bitmap sub-command + +RH-Author: Eric Blake +Message-id: 
<20200602023420.2133649-8-eblake@redhat.com> +Patchwork-id: 97074 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 07/12] qemu-img: Add bitmap sub-command +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Include actions for --add, --remove, --clear, --enable, --disable, and +--merge (note that --clear is a bit of fluff, because the same can be +accomplished by removing a bitmap and then adding a new one in its +place, but it matches what QMP commands exist). Listing is omitted, +because it does not require a bitmap name and because it was already +possible with 'qemu-img info'. A single command line can play one or +more bitmap commands in sequence on the same bitmap name (although all +added bitmaps share the same granularity, and all merged bitmaps +come from the same source file). Merge defaults to other bitmaps in +the primary image, but can also be told to merge bitmaps from a +distinct image. + +While this supports --image-opts for the file being modified, I did +not think it worth the extra complexity to support that for the source +file in a cross-file merge. Likewise, I chose to have --merge only +take a single source rather than following the QMP support for +multiple merges in one go (although you can still use more than one +--merge in the command line); in part because qemu-img is offline and +therefore atomicity is not an issue. + +Upcoming patches will add iotest coverage of these commands while +also testing other features. + +Signed-off-by: Eric Blake +Reviewed-by: Max Reitz +Message-Id: <20200513011648.166876-7-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 3b51ab4bf0f49a01cc2db7b954e0669e081719b5) + +Signed-off-by: Danilo C. L. 
de Paula + +Conflicts: + docs/tools/qemu-img.rst - lives in qemu-img.texi instead; plus + fix a typo in the text for --merge rather than waiting for + a one-line upstream followup patch + qemu-img-cmds.hx - context, use texi instead of rst + qemu-img.c - context +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + qemu-img-cmds.hx | 6 ++ + qemu-img.c | 248 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + qemu-img.texi | 27 ++++++ + 3 files changed, 281 insertions(+) + +diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx +index 1c93e6d..1a6a8e9 100644 +--- a/qemu-img-cmds.hx ++++ b/qemu-img-cmds.hx +@@ -25,6 +25,12 @@ STEXI + @item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} + ETEXI + ++DEF("bitmap", img_bitmap, ++ "bitmap (--merge SOURCE | --add | --remove | --clear | --enable | --disable)... [-b source_file [-F source_fmt]] [-g granularity] [--object objectdef] [--image-opts | -f fmt] filename bitmap") ++STEXI ++.. option:: bitmap (--merge @var{source} | --add | --remove | --clear | --enable | --disable)... 
[-b @var{source_file} [-F @var{source_fmt}]] [-g @var{granularity}] [--object @var{objectdef}] [--image-opts | -f @var{fmt}] @var{filename} @var{bitmap} ++ETEXI ++ + DEF("check", img_check, + "check [--object objectdef] [--image-opts] [-q] [-f fmt] [--output=ofmt] [-r [leaks | all]] [-T src_cache] [-U] filename") + STEXI +diff --git a/qemu-img.c b/qemu-img.c +index e69529b..11a4537 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -28,6 +28,7 @@ + #include "qemu-common.h" + #include "qemu-version.h" + #include "qapi/error.h" ++#include "qapi/qapi-commands-block-core.h" + #include "qapi/qapi-visit-block-core.h" + #include "qapi/qobject-output-visitor.h" + #include "qapi/qmp/qjson.h" +@@ -70,6 +71,12 @@ enum { + OPTION_PREALLOCATION = 265, + OPTION_SHRINK = 266, + OPTION_SALVAGE = 267, ++ OPTION_ADD = 269, ++ OPTION_REMOVE = 270, ++ OPTION_CLEAR = 271, ++ OPTION_ENABLE = 272, ++ OPTION_DISABLE = 273, ++ OPTION_MERGE = 274, + }; + + typedef enum OutputFormat { +@@ -168,6 +175,14 @@ static void QEMU_NORETURN help(void) + " '-n' skips the target volume creation (useful if the volume is created\n" + " prior to running qemu-img)\n" + "\n" ++ "Parameters to bitmap subcommand:\n" ++ " 'bitmap' is the name of the bitmap to manipulate, through one or more\n" ++ " actions from '--add', '--remove', '--clear', '--enable', '--disable',\n" ++ " or '--merge source'\n" ++ " '-g granularity' sets the granularity for '--add' actions\n" ++ " '-b source' and '-F src_fmt' tell '--merge' actions to find the source\n" ++ " bitmaps from an alternative file\n" ++ "\n" + "Parameters to check subcommand:\n" + " '-r' tries to repair any inconsistencies that are found during the check.\n" + " '-r leaks' repairs only cluster leaks, whereas '-r all' fixes all\n" +@@ -4402,6 +4417,239 @@ out: + return 0; + } + ++enum ImgBitmapAct { ++ BITMAP_ADD, ++ BITMAP_REMOVE, ++ BITMAP_CLEAR, ++ BITMAP_ENABLE, ++ BITMAP_DISABLE, ++ BITMAP_MERGE, ++}; ++typedef struct ImgBitmapAction { ++ enum ImgBitmapAct act; ++ 
const char *src; /* only used for merge */ ++ QSIMPLEQ_ENTRY(ImgBitmapAction) next; ++} ImgBitmapAction; ++ ++static int img_bitmap(int argc, char **argv) ++{ ++ Error *err = NULL; ++ int c, ret = 1; ++ QemuOpts *opts = NULL; ++ const char *fmt = NULL, *src_fmt = NULL, *src_filename = NULL; ++ const char *filename, *bitmap; ++ BlockBackend *blk = NULL, *src = NULL; ++ BlockDriverState *bs = NULL, *src_bs = NULL; ++ bool image_opts = false; ++ int64_t granularity = 0; ++ bool add = false, merge = false; ++ QSIMPLEQ_HEAD(, ImgBitmapAction) actions; ++ ImgBitmapAction *act, *act_next; ++ const char *op; ++ ++ QSIMPLEQ_INIT(&actions); ++ ++ for (;;) { ++ static const struct option long_options[] = { ++ {"help", no_argument, 0, 'h'}, ++ {"object", required_argument, 0, OPTION_OBJECT}, ++ {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS}, ++ {"add", no_argument, 0, OPTION_ADD}, ++ {"remove", no_argument, 0, OPTION_REMOVE}, ++ {"clear", no_argument, 0, OPTION_CLEAR}, ++ {"enable", no_argument, 0, OPTION_ENABLE}, ++ {"disable", no_argument, 0, OPTION_DISABLE}, ++ {"merge", required_argument, 0, OPTION_MERGE}, ++ {"granularity", required_argument, 0, 'g'}, ++ {"source-file", required_argument, 0, 'b'}, ++ {"source-format", required_argument, 0, 'F'}, ++ {0, 0, 0, 0} ++ }; ++ c = getopt_long(argc, argv, ":b:f:F:g:h", long_options, NULL); ++ if (c == -1) { ++ break; ++ } ++ ++ switch (c) { ++ case ':': ++ missing_argument(argv[optind - 1]); ++ break; ++ case '?': ++ unrecognized_option(argv[optind - 1]); ++ break; ++ case 'h': ++ help(); ++ break; ++ case 'b': ++ src_filename = optarg; ++ break; ++ case 'f': ++ fmt = optarg; ++ break; ++ case 'F': ++ src_fmt = optarg; ++ break; ++ case 'g': ++ granularity = cvtnum("granularity", optarg); ++ if (granularity < 0) { ++ return 1; ++ } ++ break; ++ case OPTION_ADD: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_ADD; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ add = true; ++ break; ++ case OPTION_REMOVE: ++ act = 
g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_REMOVE; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ break; ++ case OPTION_CLEAR: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_CLEAR; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ break; ++ case OPTION_ENABLE: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_ENABLE; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ break; ++ case OPTION_DISABLE: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_DISABLE; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ break; ++ case OPTION_MERGE: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_MERGE; ++ act->src = optarg; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ merge = true; ++ break; ++ case OPTION_OBJECT: ++ opts = qemu_opts_parse_noisily(&qemu_object_opts, optarg, true); ++ if (!opts) { ++ goto out; ++ } ++ break; ++ case OPTION_IMAGE_OPTS: ++ image_opts = true; ++ break; ++ } ++ } ++ ++ if (qemu_opts_foreach(&qemu_object_opts, ++ user_creatable_add_opts_foreach, ++ qemu_img_object_print_help, &error_fatal)) { ++ goto out; ++ } ++ ++ if (QSIMPLEQ_EMPTY(&actions)) { ++ error_report("Need at least one of --add, --remove, --clear, " ++ "--enable, --disable, or --merge"); ++ goto out; ++ } ++ ++ if (granularity && !add) { ++ error_report("granularity only supported with --add"); ++ goto out; ++ } ++ if (src_fmt && !src_filename) { ++ error_report("-F only supported with -b"); ++ goto out; ++ } ++ if (src_filename && !merge) { ++ error_report("Merge bitmap source file only supported with " ++ "--merge"); ++ goto out; ++ } ++ ++ if (optind != argc - 2) { ++ error_report("Expecting filename and bitmap name"); ++ goto out; ++ } ++ ++ filename = argv[optind]; ++ bitmap = argv[optind + 1]; ++ ++ blk = img_open(image_opts, filename, fmt, BDRV_O_RDWR, false, false, ++ false); ++ if (!blk) { ++ goto out; ++ } ++ bs = blk_bs(blk); ++ if (src_filename) { ++ src = img_open(false, src_filename, src_fmt, 0, false, false, false); ++ if (!src) { 
++ goto out; ++ } ++ src_bs = blk_bs(src); ++ } else { ++ src_bs = bs; ++ } ++ ++ QSIMPLEQ_FOREACH_SAFE(act, &actions, next, act_next) { ++ switch (act->act) { ++ case BITMAP_ADD: ++ qmp_block_dirty_bitmap_add(bs->node_name, bitmap, ++ !!granularity, granularity, true, true, ++ false, false, &err); ++ op = "add"; ++ break; ++ case BITMAP_REMOVE: ++ qmp_block_dirty_bitmap_remove(bs->node_name, bitmap, &err); ++ op = "remove"; ++ break; ++ case BITMAP_CLEAR: ++ qmp_block_dirty_bitmap_clear(bs->node_name, bitmap, &err); ++ op = "clear"; ++ break; ++ case BITMAP_ENABLE: ++ qmp_block_dirty_bitmap_enable(bs->node_name, bitmap, &err); ++ op = "enable"; ++ break; ++ case BITMAP_DISABLE: ++ qmp_block_dirty_bitmap_disable(bs->node_name, bitmap, &err); ++ op = "disable"; ++ break; ++ case BITMAP_MERGE: { ++ BlockDirtyBitmapMergeSource *merge_src; ++ BlockDirtyBitmapMergeSourceList *list; ++ ++ merge_src = g_new0(BlockDirtyBitmapMergeSource, 1); ++ merge_src->type = QTYPE_QDICT; ++ merge_src->u.external.node = g_strdup(src_bs->node_name); ++ merge_src->u.external.name = g_strdup(act->src); ++ list = g_new0(BlockDirtyBitmapMergeSourceList, 1); ++ list->value = merge_src; ++ qmp_block_dirty_bitmap_merge(bs->node_name, bitmap, list, &err); ++ qapi_free_BlockDirtyBitmapMergeSourceList(list); ++ op = "merge"; ++ break; ++ } ++ default: ++ g_assert_not_reached(); ++ } ++ ++ if (err) { ++ error_reportf_err(err, "Operation %s on bitmap %s failed: ", ++ op, bitmap); ++ goto out; ++ } ++ g_free(act); ++ } ++ ++ ret = 0; ++ ++ out: ++ blk_unref(src); ++ blk_unref(blk); ++ qemu_opts_del(opts); ++ return ret; ++} ++ + #define C_BS 01 + #define C_COUNT 02 + #define C_IF 04 +diff --git a/qemu-img.texi b/qemu-img.texi +index b5156d6..abf2771 100644 +--- a/qemu-img.texi ++++ b/qemu-img.texi +@@ -230,6 +230,33 @@ specified as well. + For write tests, by default a buffer filled with zeros is written. This can be + overridden with a pattern byte specified by @var{pattern}. 
+ ++@item bitmap (--merge @var{source} | --add | --remove | --clear | --enable | --disable)... [-b @var{source_file} [-F @var{source_fmt}]] [-g @var{granularity}] [--object @var{objectdef}] [--image-opts | -f @var{fmt}] @var{filename} @var{bitmap} ++ ++Perform one or more modifications of the persistent bitmap @var{bitmap} ++in the disk image @var{filename}. The various modifications are: ++ ++@table @option ++@item add ++create @var{bitmap}, enabled to record future edits. ++@item remove ++remove @var{bitmap}. ++@item clear ++clear @var{bitmap}. ++@item enable ++change @var{bitmap} to start recording future edits. ++@item disable ++change @var{bitmap} to stop recording future edits. ++@item merge @var{source} ++merge the contents of the @var{source} bitmap into @var{bitmap}. ++@end table ++ ++Additional options include @option{-g} which sets a non-default ++@var{granularity} for @option{--add}, and @option{-b} and @option{-F} ++which select an alternative source file for all @var{source} bitmaps used by ++@option{--merge}. ++ ++To see what bitmaps are present in an image, use @code{qemu-img info}. ++ + @item check [--object @var{objectdef}] [--image-opts] [-q] [-f @var{fmt}] [--output=@var{ofmt}] [-r [leaks | all]] [-T @var{src_cache}] [-U] @var{filename} + + Perform a consistency check on the disk image @var{filename}. 
The command can +-- +1.8.3.1 + diff --git a/kvm-qemu-img-Add-convert-bitmaps-option.patch b/kvm-qemu-img-Add-convert-bitmaps-option.patch new file mode 100755 index 0000000..20eca9f --- /dev/null +++ b/kvm-qemu-img-Add-convert-bitmaps-option.patch @@ -0,0 +1,244 @@ +From f2add7d5955770318824c3eee774bec2dd850936 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:19 +0100 +Subject: [PATCH 14/26] qemu-img: Add convert --bitmaps option + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-12-eblake@redhat.com> +Patchwork-id: 97076 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 11/12] qemu-img: Add convert --bitmaps option +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Make it easier to copy all the persistent bitmaps of (the top layer +of) a source image along with its guest-visible contents, by adding a +boolean flag for use with qemu-img convert. This is basically +shorthand, as the same effect could be accomplished with a series of +'qemu-img bitmap --add' and 'qemu-img bitmap --merge -b source' +commands, or by their corresponding QMP commands. + +Note that this command will fail in the same scenarios where 'qemu-img +measure' omits a 'bitmaps size:' line, namely, when either the source +or the destination lacks persistent bitmap support altogether. + +See also https://bugzilla.redhat.com/show_bug.cgi?id=1779893 + +While touching this, clean up a couple coding issues spotted in the +same function: an extra blank line, and merging back-to-back 'if +(!skip_create)' blocks. + +Signed-off-by: Eric Blake +Message-Id: <20200521192137.1120211-5-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 15e39ad95078d528dfb9a75417453cab60332b77) + +Signed-off-by: Danilo C. L. 
de Paula + +Conflicts: + docs/tools/qemu-img.rst - qemu-img.texi instead + qemu-img.c - context: no --target-is-zero + qemu-img-cmds.hx - context: texi instead of rst +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + qemu-img-cmds.hx | 4 ++-- + qemu-img.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- + qemu-img.texi | 4 +++- + 3 files changed, 72 insertions(+), 6 deletions(-) + +diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx +index 1a6a8e9..48144aa 100644 +--- a/qemu-img-cmds.hx ++++ b/qemu-img-cmds.hx +@@ -50,9 +50,9 @@ STEXI + ETEXI + + DEF("convert", img_convert, +- "convert [--object objectdef] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") ++ "convert [--object objectdef] [--image-opts] [--target-image-opts] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") + STEXI +-@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] [--salvage] @var{filename} [@var{filename2} [...]] @var{output_filename} ++@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] [--salvage] @var{filename} [@var{filename2} [...]] 
@var{output_filename} + ETEXI + + DEF("create", img_create, +diff --git a/qemu-img.c b/qemu-img.c +index 39e1586..6dc881b 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -77,6 +77,7 @@ enum { + OPTION_ENABLE = 272, + OPTION_DISABLE = 273, + OPTION_MERGE = 274, ++ OPTION_BITMAPS = 275, + }; + + typedef enum OutputFormat { +@@ -190,6 +191,7 @@ static void QEMU_NORETURN help(void) + " hiding corruption that has already occurred.\n" + "\n" + "Parameters to convert subcommand:\n" ++ " '--bitmaps' copies all top-level persistent bitmaps to destination\n" + " '-m' specifies how many coroutines work in parallel during the convert\n" + " process (defaults to 8)\n" + " '-W' allow to write to the target out of order rather than sequential\n" +@@ -2084,6 +2086,39 @@ static int convert_do_copy(ImgConvertState *s) + return s->ret; + } + ++static int convert_copy_bitmaps(BlockDriverState *src, BlockDriverState *dst) ++{ ++ BdrvDirtyBitmap *bm; ++ Error *err = NULL; ++ ++ FOR_EACH_DIRTY_BITMAP(src, bm) { ++ const char *name; ++ ++ if (!bdrv_dirty_bitmap_get_persistence(bm)) { ++ continue; ++ } ++ name = bdrv_dirty_bitmap_name(bm); ++ qmp_block_dirty_bitmap_add(dst->node_name, name, ++ true, bdrv_dirty_bitmap_granularity(bm), ++ true, true, ++ true, !bdrv_dirty_bitmap_enabled(bm), ++ &err); ++ if (err) { ++ error_reportf_err(err, "Failed to create bitmap %s: ", name); ++ return -1; ++ } ++ ++ do_dirty_bitmap_merge(dst->node_name, name, src->node_name, name, ++ &err); ++ if (err) { ++ error_reportf_err(err, "Failed to populate bitmap %s: ", name); ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ + #define MAX_BUF_SECTORS 32768 + + static int img_convert(int argc, char **argv) +@@ -2105,6 +2140,7 @@ static int img_convert(int argc, char **argv) + int64_t ret = -EINVAL; + bool force_share = false; + bool explict_min_sparse = false; ++ bool bitmaps = false; + + ImgConvertState s = (ImgConvertState) { + /* Need at least 4k of zeros for sparse detection */ +@@ -2123,6 +2159,7 @@ static int 
img_convert(int argc, char **argv) + {"force-share", no_argument, 0, 'U'}, + {"target-image-opts", no_argument, 0, OPTION_TARGET_IMAGE_OPTS}, + {"salvage", no_argument, 0, OPTION_SALVAGE}, ++ {"bitmaps", no_argument, 0, OPTION_BITMAPS}, + {0, 0, 0, 0} + }; + c = getopt_long(argc, argv, ":hf:O:B:Cco:l:S:pt:T:qnm:WU", +@@ -2248,6 +2285,9 @@ static int img_convert(int argc, char **argv) + case OPTION_TARGET_IMAGE_OPTS: + tgt_image_opts = true; + break; ++ case OPTION_BITMAPS: ++ bitmaps = true; ++ break; + } + } + +@@ -2304,7 +2344,6 @@ static int img_convert(int argc, char **argv) + goto fail_getopt; + } + +- + /* ret is still -EINVAL until here */ + ret = bdrv_parse_cache_mode(src_cache, &src_flags, &src_writethrough); + if (ret < 0) { +@@ -2458,6 +2497,20 @@ static int img_convert(int argc, char **argv) + } + } + ++ /* Determine if bitmaps need copying */ ++ if (bitmaps) { ++ if (s.src_num > 1) { ++ error_report("Copying bitmaps only possible with single source"); ++ ret = -1; ++ goto out; ++ } ++ if (!bdrv_supports_persistent_dirty_bitmap(blk_bs(s.src[0]))) { ++ error_report("Source lacks bitmap support"); ++ ret = -1; ++ goto out; ++ } ++ } ++ + /* + * The later open call will need any decryption secrets, and + * bdrv_create() will purge "opts", so extract them now before +@@ -2466,9 +2519,7 @@ static int img_convert(int argc, char **argv) + if (!skip_create) { + open_opts = qdict_new(); + qemu_opt_foreach(opts, img_add_key_secrets, open_opts, &error_abort); +- } + +- if (!skip_create) { + /* Create the new image */ + ret = bdrv_create(drv, out_filename, opts, &local_err); + if (ret < 0) { +@@ -2506,6 +2557,13 @@ static int img_convert(int argc, char **argv) + } + out_bs = blk_bs(s.target); + ++ if (bitmaps && !bdrv_supports_persistent_dirty_bitmap(out_bs)) { ++ error_report("Format driver '%s' does not support bitmaps", ++ out_bs->drv->format_name); ++ ret = -1; ++ goto out; ++ } ++ + if (s.compressed && !block_driver_can_compress(out_bs->drv)) { + 
error_report("Compression not supported for this file format"); + ret = -1; +@@ -2565,6 +2623,12 @@ static int img_convert(int argc, char **argv) + } + + ret = convert_do_copy(&s); ++ ++ /* Now copy the bitmaps */ ++ if (bitmaps && ret == 0) { ++ ret = convert_copy_bitmaps(blk_bs(s.src[0]), out_bs); ++ } ++ + out: + if (!ret) { + qemu_progress_print(100, 0); +diff --git a/qemu-img.texi b/qemu-img.texi +index 3670b96..b95d019 100644 +--- a/qemu-img.texi ++++ b/qemu-img.texi +@@ -161,6 +161,8 @@ Parameters to convert subcommand: + + @table @option + ++@item --bitmaps ++Additionally copy all persistent bitmaps from the top layer of the source + @item -n + Skip the creation of the target volume + @item -m +@@ -357,7 +359,7 @@ Error on reading data + + @end table + +-@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] @var{filename} [@var{filename2} [...]] @var{output_filename} ++@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] @var{filename} [@var{filename2} [...]] @var{output_filename} + + Convert the disk image @var{filename} or a snapshot @var{snapshot_param} + to disk image @var{output_filename} using format @var{output_fmt}. 
It can be optionally compressed (@code{-c} +-- +1.8.3.1 + diff --git a/kvm-qemu-img-Factor-out-code-for-merging-bitmaps.patch b/kvm-qemu-img-Factor-out-code-for-merging-bitmaps.patch new file mode 100755 index 0000000..c4012b7 --- /dev/null +++ b/kvm-qemu-img-Factor-out-code-for-merging-bitmaps.patch @@ -0,0 +1,89 @@ +From 58816c3709e5058e8805333ca011cc4e793d67ff Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:18 +0100 +Subject: [PATCH 13/26] qemu-img: Factor out code for merging bitmaps +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-11-eblake@redhat.com> +Patchwork-id: 97078 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 10/12] qemu-img: Factor out code for merging bitmaps +Bugzilla: 1779893 1779904 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +The next patch will add another client that wants to merge dirty +bitmaps; it will be easier to refactor the code to construct the QAPI +struct correctly into a helper function. + +Signed-off-by: Eric Blake +Message-Id: <20200521192137.1120211-4-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 6c729dd832207d7347ecb074912f538e2942f269) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. 
de Paula +--- + qemu-img.c | 34 +++++++++++++++++++++------------- + 1 file changed, 21 insertions(+), 13 deletions(-) + +diff --git a/qemu-img.c b/qemu-img.c +index b57856e..39e1586 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -1582,6 +1582,24 @@ out4: + return ret; + } + ++/* Convenience wrapper around qmp_block_dirty_bitmap_merge */ ++static void do_dirty_bitmap_merge(const char *dst_node, const char *dst_name, ++ const char *src_node, const char *src_name, ++ Error **errp) ++{ ++ BlockDirtyBitmapMergeSource *merge_src; ++ BlockDirtyBitmapMergeSourceList *list; ++ ++ merge_src = g_new0(BlockDirtyBitmapMergeSource, 1); ++ merge_src->type = QTYPE_QDICT; ++ merge_src->u.external.node = g_strdup(src_node); ++ merge_src->u.external.name = g_strdup(src_name); ++ list = g_new0(BlockDirtyBitmapMergeSourceList, 1); ++ list->value = merge_src; ++ qmp_block_dirty_bitmap_merge(dst_node, dst_name, list, errp); ++ qapi_free_BlockDirtyBitmapMergeSourceList(list); ++} ++ + enum ImgConvertBlockStatus { + BLK_DATA, + BLK_ZERO, +@@ -4614,21 +4632,11 @@ static int img_bitmap(int argc, char **argv) + qmp_block_dirty_bitmap_disable(bs->node_name, bitmap, &err); + op = "disable"; + break; +- case BITMAP_MERGE: { +- BlockDirtyBitmapMergeSource *merge_src; +- BlockDirtyBitmapMergeSourceList *list; +- +- merge_src = g_new0(BlockDirtyBitmapMergeSource, 1); +- merge_src->type = QTYPE_QDICT; +- merge_src->u.external.node = g_strdup(src_bs->node_name); +- merge_src->u.external.name = g_strdup(act->src); +- list = g_new0(BlockDirtyBitmapMergeSourceList, 1); +- list->value = merge_src; +- qmp_block_dirty_bitmap_merge(bs->node_name, bitmap, list, &err); +- qapi_free_BlockDirtyBitmapMergeSourceList(list); ++ case BITMAP_MERGE: ++ do_dirty_bitmap_merge(bs->node_name, bitmap, src_bs->node_name, ++ act->src, &err); + op = "merge"; + break; +- } + default: + g_assert_not_reached(); + } +-- +1.8.3.1 + diff --git a/kvm-qemu-img-convert-Don-t-pre-zero-images.patch 
b/kvm-qemu-img-convert-Don-t-pre-zero-images.patch new file mode 100755 index 0000000..28311f4 --- /dev/null +++ b/kvm-qemu-img-convert-Don-t-pre-zero-images.patch @@ -0,0 +1,73 @@ +From eea45924903f03dc6d8f20576be0a4a84d5acce4 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 10 Feb 2021 10:16:11 -0500 +Subject: [PATCH 4/5] qemu-img convert: Don't pre-zero images + +RH-Author: Kevin Wolf +Message-id: <20210210101611.137928-2-kwolf@redhat.com> +Patchwork-id: 101030 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] qemu-img convert: Don't pre-zero images +Bugzilla: 1855250 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Max Reitz + +Since commit 5a37b60a61c, qemu-img create will pre-zero the target image +if it isn't already zero-initialised (most importantly, for host block +devices, but also iscsi etc.), so that writing explicit zeros wouldn't +be necessary later. + +This could speed up the operation significantly, in particular when the +source image file was only sparsely populated. However, it also means +that some blocks are written twice: Once when pre-zeroing them, and then +when they are overwritten with actual data. On a full image, the +pre-zeroing is wasted work because everything will be overwritten. + +In practice, write_zeroes typically turns out faster than writing +explicit zero buffers, but slow enough that first zeroing everything and +then overwriting parts can be a significant net loss. + +Meanwhile, qemu-img convert was rewritten in 690c7301600 and zero blocks +are now written to the target using bdrv_co_pwrite_zeroes() if the +target could be pre-zeroed. This way we already make use of the faster +write_zeroes operation, but avoid writing any blocks twice. + +Remove the pre-zeroing because these days this former optimisation has +actually turned into a pessimisation in the common case. 
+ +Reported-by: Nir Soffer +Signed-off-by: Kevin Wolf +Message-Id: <20200622151203.35624-1-kwolf@redhat.com> +Tested-by: Nir Soffer +Reviewed-by: Eric Blake +Signed-off-by: Kevin Wolf +(cherry picked from commit edafc70c0c8510862f2f213a3acf7067113bcd08) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + qemu-img.c | 9 --------- + 1 file changed, 9 deletions(-) + +diff --git a/qemu-img.c b/qemu-img.c +index a27ad70851..b10dc5129b 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -2029,15 +2029,6 @@ static int convert_do_copy(ImgConvertState *s) + s->has_zero_init = false; + } + +- if (!s->has_zero_init && !s->target_has_backing && +- bdrv_can_write_zeroes_with_unmap(blk_bs(s->target))) +- { +- ret = blk_make_zero(s->target, BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK); +- if (ret == 0) { +- s->has_zero_init = true; +- } +- } +- + /* Allocate buffer for copied data. For compressed images, only one cluster + * can be copied at a time. */ + if (s->compressed) { +-- +2.27.0 + diff --git a/kvm-qemu_img-add-cvtnum_full-to-print-error-reports.patch b/kvm-qemu_img-add-cvtnum_full-to-print-error-reports.patch new file mode 100755 index 0000000..b4180b9 --- /dev/null +++ b/kvm-qemu_img-add-cvtnum_full-to-print-error-reports.patch @@ -0,0 +1,241 @@ +From 1a8a4ece5def912e7cfa5ef8565fc8ecef6e72c3 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:11 +0100 +Subject: [PATCH 06/26] qemu_img: add cvtnum_full to print error reports + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-4-eblake@redhat.com> +Patchwork-id: 97067 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 03/12] qemu_img: add cvtnum_full to print error reports +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +From: Eyal Moscovici + +All calls to cvtnum check the return value and print the same error +message more or less. 
And so error reporting moved to cvtnum_full to +reduce code duplication and provide a single error +message. Additionally, cvtnum now wraps cvtnum_full with the existing +default range of 0 to MAX_INT64. + +Acked-by: Mark Kanda +Signed-off-by: Eyal Moscovici +Message-Id: <20200513133629.18508-2-eyal.moscovici@oracle.com> +Reviewed-by: Eric Blake +[eblake: fix printf formatting, avoid trailing space, change error wording, +reformat commit message] +Signed-off-by: Eric Blake +(cherry picked from commit 43d589b074370ebc9b340340b5f641b385da9df8) +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + qemu-img.c | 76 +++++++++++++++++++++------------------------- + tests/qemu-iotests/049.out | 8 ++--- + 2 files changed, 38 insertions(+), 46 deletions(-) + +diff --git a/qemu-img.c b/qemu-img.c +index 95a24b9..e69529b 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -422,19 +422,31 @@ static int add_old_style_options(const char *fmt, QemuOpts *opts, + return 0; + } + +-static int64_t cvtnum(const char *s) ++static int64_t cvtnum_full(const char *name, const char *value, int64_t min, ++ int64_t max) + { + int err; +- uint64_t value; +- +- err = qemu_strtosz(s, NULL, &value); +- if (err < 0) { ++ uint64_t res; ++ ++ err = qemu_strtosz(value, NULL, &res); ++ if (err < 0 && err != -ERANGE) { ++ error_report("Invalid %s specified. You may use " ++ "k, M, G, T, P or E suffixes for", name); ++ error_report("kilobytes, megabytes, gigabytes, terabytes, " ++ "petabytes and exabytes."); + return err; + } +- if (value > INT64_MAX) { ++ if (err == -ERANGE || res > max || res < min) { ++ error_report("Invalid %s specified. 
Must be between %" PRId64 ++ " and %" PRId64 ".", name, min, max); + return -ERANGE; + } +- return value; ++ return res; ++} ++ ++static int64_t cvtnum(const char *name, const char *value) ++{ ++ return cvtnum_full(name, value, 0, INT64_MAX); + } + + static int img_create(int argc, char **argv) +@@ -532,16 +544,8 @@ static int img_create(int argc, char **argv) + if (optind < argc) { + int64_t sval; + +- sval = cvtnum(argv[optind++]); ++ sval = cvtnum("image size", argv[optind++]); + if (sval < 0) { +- if (sval == -ERANGE) { +- error_report("Image size must be less than 8 EiB!"); +- } else { +- error_report("Invalid image size specified! You may use k, M, " +- "G, T, P or E suffixes for "); +- error_report("kilobytes, megabytes, gigabytes, terabytes, " +- "petabytes and exabytes."); +- } + goto fail; + } + img_size = (uint64_t)sval; +@@ -2148,8 +2152,10 @@ static int img_convert(int argc, char **argv) + { + int64_t sval; + +- sval = cvtnum(optarg); +- if (sval < 0 || !QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) || ++ sval = cvtnum("buffer size for sparse output", optarg); ++ if (sval < 0) { ++ goto fail_getopt; ++ } else if (!QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) || + sval / BDRV_SECTOR_SIZE > MAX_BUF_SECTORS) { + error_report("Invalid buffer size for sparse output specified. " + "Valid sizes are multiples of %llu up to %llu. 
Select " +@@ -4229,9 +4235,8 @@ static int img_bench(int argc, char **argv) + break; + case 'o': + { +- offset = cvtnum(optarg); ++ offset = cvtnum("offset", optarg); + if (offset < 0) { +- error_report("Invalid offset specified"); + return 1; + } + break; +@@ -4244,9 +4249,8 @@ static int img_bench(int argc, char **argv) + { + int64_t sval; + +- sval = cvtnum(optarg); +- if (sval < 0 || sval > INT_MAX) { +- error_report("Invalid buffer size specified"); ++ sval = cvtnum_full("buffer size", optarg, 0, INT_MAX); ++ if (sval < 0) { + return 1; + } + +@@ -4257,9 +4261,8 @@ static int img_bench(int argc, char **argv) + { + int64_t sval; + +- sval = cvtnum(optarg); +- if (sval < 0 || sval > INT_MAX) { +- error_report("Invalid step size specified"); ++ sval = cvtnum_full("step_size", optarg, 0, INT_MAX); ++ if (sval < 0) { + return 1; + } + +@@ -4429,10 +4432,9 @@ static int img_dd_bs(const char *arg, + { + int64_t res; + +- res = cvtnum(arg); ++ res = cvtnum_full("bs", arg, 1, INT_MAX); + +- if (res <= 0 || res > INT_MAX) { +- error_report("invalid number: '%s'", arg); ++ if (res < 0) { + return 1; + } + in->bsz = out->bsz = res; +@@ -4444,10 +4446,9 @@ static int img_dd_count(const char *arg, + struct DdIo *in, struct DdIo *out, + struct DdInfo *dd) + { +- dd->count = cvtnum(arg); ++ dd->count = cvtnum("count", arg); + + if (dd->count < 0) { +- error_report("invalid number: '%s'", arg); + return 1; + } + +@@ -4476,10 +4477,9 @@ static int img_dd_skip(const char *arg, + struct DdIo *in, struct DdIo *out, + struct DdInfo *dd) + { +- in->offset = cvtnum(arg); ++ in->offset = cvtnum("skip", arg); + + if (in->offset < 0) { +- error_report("invalid number: '%s'", arg); + return 1; + } + +@@ -4869,16 +4869,8 @@ static int img_measure(int argc, char **argv) + { + int64_t sval; + +- sval = cvtnum(optarg); ++ sval = cvtnum("image size", optarg); + if (sval < 0) { +- if (sval == -ERANGE) { +- error_report("Image size must be less than 8 EiB!"); +- } else { +- 
error_report("Invalid image size specified! You may use " +- "k, M, G, T, P or E suffixes for "); +- error_report("kilobytes, megabytes, gigabytes, terabytes, " +- "petabytes and exabytes."); +- } + goto out; + } + img_size = (uint64_t)sval; +diff --git a/tests/qemu-iotests/049.out b/tests/qemu-iotests/049.out +index 6b50540..8b35f3d 100644 +--- a/tests/qemu-iotests/049.out ++++ b/tests/qemu-iotests/049.out +@@ -92,19 +92,19 @@ Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 size=1649267441664 cluster_size=65536 l + == 3. Invalid sizes == + + qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- -1024 +-qemu-img: Image size must be less than 8 EiB! ++qemu-img: Invalid image size specified. Must be between 0 and 9223372036854775807. + + qemu-img create -f qcow2 -o size=-1024 TEST_DIR/t.qcow2 + qemu-img: TEST_DIR/t.qcow2: Value '-1024' is out of range for parameter 'size' + + qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- -1k +-qemu-img: Image size must be less than 8 EiB! ++qemu-img: Invalid image size specified. Must be between 0 and 9223372036854775807. + + qemu-img create -f qcow2 -o size=-1k TEST_DIR/t.qcow2 + qemu-img: TEST_DIR/t.qcow2: Value '-1k' is out of range for parameter 'size' + + qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- 1kilobyte +-qemu-img: Invalid image size specified! You may use k, M, G, T, P or E suffixes for ++qemu-img: Invalid image size specified. You may use k, M, G, T, P or E suffixes for + qemu-img: kilobytes, megabytes, gigabytes, terabytes, petabytes and exabytes. + + qemu-img create -f qcow2 -o size=1kilobyte TEST_DIR/t.qcow2 +@@ -113,7 +113,7 @@ Optional suffix k, M, G, T, P or E means kilo-, mega-, giga-, tera-, peta- + and exabytes, respectively. + + qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- foobar +-qemu-img: Invalid image size specified! You may use k, M, G, T, P or E suffixes for ++qemu-img: Invalid image size specified. You may use k, M, G, T, P or E suffixes for + qemu-img: kilobytes, megabytes, gigabytes, terabytes, petabytes and exabytes. 
+ + qemu-img create -f qcow2 -o size=foobar TEST_DIR/t.qcow2 +-- +1.8.3.1 + diff --git a/kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch b/kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch new file mode 100755 index 0000000..3b533a5 --- /dev/null +++ b/kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch @@ -0,0 +1,73 @@ +From c5f90436555d7ab2c1c28bf1cfdb5f5f8ca97816 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 24 Dec 2020 12:53:04 -0500 +Subject: [PATCH 4/5] qga: Use qemu_get_host_name() instead of + g_get_host_name() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201224125304.62697-4-marcandre.lureau@redhat.com> +Patchwork-id: 100500 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/3] qga: Use qemu_get_host_name() instead of g_get_host_name() +Bugzilla: 1910326 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé + +From: Michal Privoznik + +Problem with g_get_host_name() is that on the first call it saves +the hostname into a global variable and from then on, every +subsequent call returns the saved hostname. Even if the hostname +changes. This doesn't play nicely with guest agent, because if +the hostname is acquired before the guest is set up (e.g. on the +first boot, or before DHCP) we will report old, invalid hostname. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1845127 + +Signed-off-by: Michal Privoznik +Reviewed-by: Daniel P. Berrangé +Cc: qemu-stable@nongnu.org +Signed-off-by: Michael Roth + +(cherry picked from commit 0d3a8f32b1e0eca279da1b0cc793efc7250c3daf) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. 
de Paula +--- + qga/commands.c | 17 +++++++++++++---- + 1 file changed, 13 insertions(+), 4 deletions(-) + +diff --git a/qga/commands.c b/qga/commands.c +index 43c323ceada..93bed292d08 100644 +--- a/qga/commands.c ++++ b/qga/commands.c +@@ -502,11 +502,20 @@ int ga_parse_whence(GuestFileWhence *whence, Error **errp) + GuestHostName *qmp_guest_get_host_name(Error **errp) + { + GuestHostName *result = NULL; +- gchar const *hostname = g_get_host_name(); +- if (hostname != NULL) { +- result = g_new0(GuestHostName, 1); +- result->host_name = g_strdup(hostname); ++ g_autofree char *hostname = qemu_get_host_name(errp); ++ ++ /* ++ * We want to avoid using g_get_host_name() because that ++ * caches the result and we wouldn't reflect changes in the ++ * host name. ++ */ ++ ++ if (!hostname) { ++ hostname = g_strdup("localhost"); + } ++ ++ result = g_new0(GuestHostName, 1); ++ result->host_name = g_steal_pointer(&hostname); + return result; + } + +-- +2.27.0 + diff --git a/kvm-qga-add-command-guest-get-disks.patch b/kvm-qga-add-command-guest-get-disks.patch new file mode 100755 index 0000000..360301d --- /dev/null +++ b/kvm-qga-add-command-guest-get-disks.patch @@ -0,0 +1,115 @@ +From 58688d868656e77f67ea915544b0bb3bb60f33d8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:11 -0500 +Subject: [PATCH 10/14] qga: add command guest-get-disks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-7-marcandre.lureau@redhat.com> +Patchwork-id: 100475 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 06/10] qga: add command guest-get-disks +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Tomáš Golembiovský + +Add API and stubs for new guest-get-disks command. 
+ +The command guest-get-fsinfo can be used to list information about disks +and partitions but it is limited only to mounted disks with filesystem. +This new command should allow listing information about disks of the VM +regardless of whether they are mounted or not. This can be useful for +management applications for mapping virtualized devices or pass-through +devices to device names in the guest OS. + +Signed-off-by: Tomáš Golembiovský +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Marc-André Lureau +Signed-off-by: Michael Roth + +(cherry-picked from commit c27ea3f9ef7c7f29e55bde91879f8514abce9c38) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 6 ++++++ + qga/commands-win32.c | 6 ++++++ + qga/qapi-schema.json | 31 +++++++++++++++++++++++++++++++ + 3 files changed, 43 insertions(+) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index c86c87ed522..5095104afc0 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -3039,3 +3039,9 @@ GuestOSInfo *qmp_guest_get_osinfo(Error **errp) + + return info; + } ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} +diff --git a/qga/commands-win32.c b/qga/commands-win32.c +index 55ba5b263af..be63fa2b208 100644 +--- a/qga/commands-win32.c ++++ b/qga/commands-win32.c +@@ -2234,3 +2234,9 @@ GuestOSInfo *qmp_guest_get_osinfo(Error **errp) + + return info; + } ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index fb4605cc19c..22df375c92f 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -852,6 +852,37 @@ + 'bus': 'int', 'target': 'int', 'unit': 'int', + '*serial': 'str', '*dev': 'str'} } + ++## ++# @GuestDiskInfo: ++# ++# @name: device node (Linux) or device UNC (Windows) ++# @partition: whether this is a partition or disk ++#
@dependents: list of dependent devices; e.g. for LVs of the LVM this will ++# hold the list of PVs, for LUKS encrypted volume this will ++# contain the disk where the volume is placed. (Linux) ++# @address: disk address information (only for non-virtual devices) ++# @alias: optional alias assigned to the disk, on Linux this is a name assigned ++# by device mapper ++# ++# Since 5.2 ++## ++{ 'struct': 'GuestDiskInfo', ++ 'data': {'name': 'str', 'partition': 'bool', 'dependents': ['str'], ++ '*address': 'GuestDiskAddress', '*alias': 'str'} } ++ ++## ++# @guest-get-disks: ++# ++# Returns: The list of disks in the guest. For Windows these are only the ++# physical disks. On Linux these are all root block devices of ++# non-zero size including e.g. removable devices, loop devices, ++# NBD, etc. ++# ++# Since: 5.2 ++## ++{ 'command': 'guest-get-disks', ++ 'returns': ['GuestDiskInfo'] } ++ + ## + # @GuestFilesystemInfo: + # +-- +2.27.0 + diff --git a/kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch b/kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch new file mode 100755 index 0000000..939a212 --- /dev/null +++ b/kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch @@ -0,0 +1,427 @@ +From 086957b970a8f4165249589e2bc0cc08d1800db3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:12 -0500 +Subject: [PATCH 11/14] qga: add implementation of guest-get-disks for Linux +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-8-marcandre.lureau@redhat.com> +Patchwork-id: 100478 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 07/10] qga: add implementation of guest-get-disks for Linux +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Tomáš Golembiovský + +The command lists all disks (real and virtual) as well as disk +partitions. 
For each disk the list of dependent disks is also listed and +/dev path is used as a handle so it can be matched with "name" field of +other returned disk entries. For disk partitions the "dependents" list +is populated with the parent device for easier tracking of +hierarchy. + +Example output: +{ + "return": [ + ... + { + "name": "/dev/dm-0", + "partition": false, + "dependents": [ + "/dev/sda2" + ], + "alias": "luks-7062202e-5b9b-433e-81e8-6628c40da9f7" + }, + { + "name": "/dev/sda2", + "partition": true, + "dependents": [ + "/dev/sda" + ] + }, + { + "name": "/dev/sda", + "partition": false, + "address": { + "serial": "SAMSUNG_MZ7LN512HCHP-000L1_S1ZKNXAG822493", + "bus-type": "sata", + ... + "dev": "/dev/sda", + "target": 0 + }, + "dependents": [] + }, + ... + ] +} + +Signed-off-by: Tomáš Golembiovský +Reviewed-by: Marc-André Lureau +*add missing stub for !defined(CONFIG_FSFREEZE) +*remove unused deps_dir variable +Signed-off-by: Michael Roth +(cherry picked from commit fed3956429d560a06fc2d2fcf1a01efb58659f87) +Signed-off-by: Danilo C. L.
de Paula +--- + qga/commands-posix.c | 303 +++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 292 insertions(+), 11 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 5095104afc0..96f5ddafd3a 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -1152,13 +1152,27 @@ static void build_guest_fsinfo_for_virtual_device(char const *syspath, + closedir(dir); + } + ++static bool is_disk_virtual(const char *devpath, Error **errp) ++{ ++ g_autofree char *syspath = realpath(devpath, NULL); ++ ++ if (!syspath) { ++ error_setg_errno(errp, errno, "realpath(\"%s\")", devpath); ++ return false; ++ } ++ return strstr(syspath, "/devices/virtual/block/") != NULL; ++} ++ + /* Dispatch to functions for virtual/real device */ + static void build_guest_fsinfo_for_device(char const *devpath, + GuestFilesystemInfo *fs, + Error **errp) + { +- char *syspath = realpath(devpath, NULL); ++ ERRP_GUARD(); ++ g_autofree char *syspath = NULL; ++ bool is_virtual = false; + ++ syspath = realpath(devpath, NULL); + if (!syspath) { + error_setg_errno(errp, errno, "realpath(\"%s\")", devpath); + return; +@@ -1169,16 +1183,281 @@ static void build_guest_fsinfo_for_device(char const *devpath, + } + + g_debug(" parse sysfs path '%s'", syspath); +- +- if (strstr(syspath, "/devices/virtual/block/")) { ++ is_virtual = is_disk_virtual(syspath, errp); ++ if (*errp != NULL) { ++ return; ++ } ++ if (is_virtual) { + build_guest_fsinfo_for_virtual_device(syspath, fs, errp); + } else { + build_guest_fsinfo_for_real_device(syspath, fs, errp); + } ++} ++ ++#ifdef CONFIG_LIBUDEV ++ ++/* ++ * Wrapper around build_guest_fsinfo_for_device() for getting just ++ * the disk address. 
++ */ ++static GuestDiskAddress *get_disk_address(const char *syspath, Error **errp) ++{ ++ g_autoptr(GuestFilesystemInfo) fs = NULL; + +- free(syspath); ++ fs = g_new0(GuestFilesystemInfo, 1); ++ build_guest_fsinfo_for_device(syspath, fs, errp); ++ if (fs->disk != NULL) { ++ return g_steal_pointer(&fs->disk->value); ++ } ++ return NULL; + } + ++static char *get_alias_for_syspath(const char *syspath) ++{ ++ struct udev *udev = NULL; ++ struct udev_device *udevice = NULL; ++ char *ret = NULL; ++ ++ udev = udev_new(); ++ if (udev == NULL) { ++ g_debug("failed to query udev"); ++ goto out; ++ } ++ udevice = udev_device_new_from_syspath(udev, syspath); ++ if (udevice == NULL) { ++ g_debug("failed to query udev for path: %s", syspath); ++ goto out; ++ } else { ++ const char *alias = udev_device_get_property_value( ++ udevice, "DM_NAME"); ++ /* ++ * NULL means there was an error and empty string means there is no ++ * alias. In case of no alias we return NULL instead of empty string. ++ */ ++ if (alias == NULL) { ++ g_debug("failed to query udev for device alias for: %s", ++ syspath); ++ } else if (*alias != 0) { ++ ret = g_strdup(alias); ++ } ++ } ++ ++out: ++ udev_unref(udev); ++ udev_device_unref(udevice); ++ return ret; ++} ++ ++static char *get_device_for_syspath(const char *syspath) ++{ ++ struct udev *udev = NULL; ++ struct udev_device *udevice = NULL; ++ char *ret = NULL; ++ ++ udev = udev_new(); ++ if (udev == NULL) { ++ g_debug("failed to query udev"); ++ goto out; ++ } ++ udevice = udev_device_new_from_syspath(udev, syspath); ++ if (udevice == NULL) { ++ g_debug("failed to query udev for path: %s", syspath); ++ goto out; ++ } else { ++ ret = g_strdup(udev_device_get_devnode(udevice)); ++ } ++ ++out: ++ udev_unref(udev); ++ udev_device_unref(udevice); ++ return ret; ++} ++ ++static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk) ++{ ++ g_autofree char *deps_dir = NULL; ++ const gchar *dep; ++ GDir *dp_deps = NULL; ++ ++ /* List dependent disks */ 
++ deps_dir = g_strdup_printf("%s/slaves", disk_dir); ++ g_debug(" listing entries in: %s", deps_dir); ++ dp_deps = g_dir_open(deps_dir, 0, NULL); ++ if (dp_deps == NULL) { ++ g_debug("failed to list entries in %s", deps_dir); ++ return; ++ } ++ while ((dep = g_dir_read_name(dp_deps)) != NULL) { ++ g_autofree char *dep_dir = NULL; ++ strList *dep_item = NULL; ++ char *dev_name; ++ ++ /* Add dependent disks */ ++ dep_dir = g_strdup_printf("%s/%s", deps_dir, dep); ++ dev_name = get_device_for_syspath(dep_dir); ++ if (dev_name != NULL) { ++ g_debug(" adding dependent device: %s", dev_name); ++ dep_item = g_new0(strList, 1); ++ dep_item->value = dev_name; ++ dep_item->next = disk->dependents; ++ disk->dependents = dep_item; ++ } ++ } ++ g_dir_close(dp_deps); ++} ++ ++/* ++ * Detect partitions subdirectory, name is "<disk_name><number>" or ++ * "<disk_name>p<number>" ++ * ++ * @disk_name -- last component of /sys path (e.g. sda) ++ * @disk_dir -- sys path of the disk (e.g. /sys/block/sda) ++ * @disk_dev -- device node of the disk (e.g.
/dev/sda) ++ */ ++static GuestDiskInfoList *get_disk_partitions( ++ GuestDiskInfoList *list, ++ const char *disk_name, const char *disk_dir, ++ const char *disk_dev) ++{ ++ GuestDiskInfoList *item, *ret = list; ++ struct dirent *de_disk; ++ DIR *dp_disk = NULL; ++ size_t len = strlen(disk_name); ++ ++ dp_disk = opendir(disk_dir); ++ while ((de_disk = readdir(dp_disk)) != NULL) { ++ g_autofree char *partition_dir = NULL; ++ char *dev_name; ++ GuestDiskInfo *partition; ++ ++ if (!(de_disk->d_type & DT_DIR)) { ++ continue; ++ } ++ ++ if (!(strncmp(disk_name, de_disk->d_name, len) == 0 && ++ ((*(de_disk->d_name + len) == 'p' && ++ isdigit(*(de_disk->d_name + len + 1))) || ++ isdigit(*(de_disk->d_name + len))))) { ++ continue; ++ } ++ ++ partition_dir = g_strdup_printf("%s/%s", ++ disk_dir, de_disk->d_name); ++ dev_name = get_device_for_syspath(partition_dir); ++ if (dev_name == NULL) { ++ g_debug("Failed to get device name for syspath: %s", ++ disk_dir); ++ continue; ++ } ++ partition = g_new0(GuestDiskInfo, 1); ++ partition->name = dev_name; ++ partition->partition = true; ++ /* Add parent disk as dependent for easier tracking of hierarchy */ ++ partition->dependents = g_new0(strList, 1); ++ partition->dependents->value = g_strdup(disk_dev); ++ ++ item = g_new0(GuestDiskInfoList, 1); ++ item->value = partition; ++ item->next = ret; ++ ret = item; ++ ++ } ++ closedir(dp_disk); ++ ++ return ret; ++} ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ GuestDiskInfoList *item, *ret = NULL; ++ GuestDiskInfo *disk; ++ DIR *dp = NULL; ++ struct dirent *de = NULL; ++ ++ g_debug("listing /sys/block directory"); ++ dp = opendir("/sys/block"); ++ if (dp == NULL) { ++ error_setg_errno(errp, errno, "Can't open directory \"/sys/block\""); ++ return NULL; ++ } ++ while ((de = readdir(dp)) != NULL) { ++ g_autofree char *disk_dir = NULL, *line = NULL, ++ *size_path = NULL; ++ char *dev_name; ++ Error *local_err = NULL; ++ if (de->d_type != DT_LNK) { ++ g_debug(" skipping 
entry: %s", de->d_name); ++ continue; ++ } ++ ++ /* Check size and skip zero-sized disks */ ++ g_debug(" checking disk size"); ++ size_path = g_strdup_printf("/sys/block/%s/size", de->d_name); ++ if (!g_file_get_contents(size_path, &line, NULL, NULL)) { ++ g_debug(" failed to read disk size"); ++ continue; ++ } ++ if (g_strcmp0(line, "0\n") == 0) { ++ g_debug(" skipping zero-sized disk"); ++ continue; ++ } ++ ++ g_debug(" adding %s", de->d_name); ++ disk_dir = g_strdup_printf("/sys/block/%s", de->d_name); ++ dev_name = get_device_for_syspath(disk_dir); ++ if (dev_name == NULL) { ++ g_debug("Failed to get device name for syspath: %s", ++ disk_dir); ++ continue; ++ } ++ disk = g_new0(GuestDiskInfo, 1); ++ disk->name = dev_name; ++ disk->partition = false; ++ disk->alias = get_alias_for_syspath(disk_dir); ++ disk->has_alias = (disk->alias != NULL); ++ item = g_new0(GuestDiskInfoList, 1); ++ item->value = disk; ++ item->next = ret; ++ ret = item; ++ ++ /* Get address for non-virtual devices */ ++ bool is_virtual = is_disk_virtual(disk_dir, &local_err); ++ if (local_err != NULL) { ++ g_debug(" failed to check disk path, ignoring error: %s", ++ error_get_pretty(local_err)); ++ error_free(local_err); ++ local_err = NULL; ++ /* Don't try to get the address */ ++ is_virtual = true; ++ } ++ if (!is_virtual) { ++ disk->address = get_disk_address(disk_dir, &local_err); ++ if (local_err != NULL) { ++ g_debug(" failed to get device info, ignoring error: %s", ++ error_get_pretty(local_err)); ++ error_free(local_err); ++ local_err = NULL; ++ } else if (disk->address != NULL) { ++ disk->has_address = true; ++ } ++ } ++ ++ get_disk_deps(disk_dir, disk); ++ ret = get_disk_partitions(ret, de->d_name, disk_dir, dev_name); ++ } ++ return ret; ++} ++ ++#else ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} ++ ++#endif ++ + /* Return a list of the disk device(s)' info which @mount lies on */ + static 
GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount, + Error **errp) +@@ -2770,6 +3049,13 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) + + return 0; + } ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} ++ + #endif /* CONFIG_FSFREEZE */ + + #if !defined(CONFIG_FSTRIM) +@@ -2806,7 +3092,8 @@ GList *ga_command_blacklist_init(GList *blacklist) + const char *list[] = { + "guest-get-fsinfo", "guest-fsfreeze-status", + "guest-fsfreeze-freeze", "guest-fsfreeze-freeze-list", +- "guest-fsfreeze-thaw", "guest-get-fsinfo", NULL}; ++ "guest-fsfreeze-thaw", "guest-get-fsinfo", ++ "guest-get-disks", NULL}; + char **p = (char **)list; + + while (*p) { +@@ -3039,9 +3326,3 @@ GuestOSInfo *qmp_guest_get_osinfo(Error **errp) + + return info; + } +- +-GuestDiskInfoList *qmp_guest_get_disks(Error **errp) +-{ +- error_setg(errp, QERR_UNSUPPORTED); +- return NULL; +-} +-- +2.27.0 + diff --git a/kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch b/kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch new file mode 100755 index 0000000..f82d95d --- /dev/null +++ b/kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch @@ -0,0 +1,181 @@ +From 925163bf8498e26c19742dbd34b6b324e49c07b6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:13 -0500 +Subject: [PATCH 12/14] qga: add implementation of guest-get-disks for Windows +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-9-marcandre.lureau@redhat.com> +Patchwork-id: 100479 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 08/10] qga: add implementation of guest-get-disks for Windows +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Tomáš Golembiovský + +The command lists all the physical disk drives. 
Unlike for Linux +partitions and virtual volumes are not listed. + +Example output: + +{ + "return": [ + { + "name": "\\\\.\\PhysicalDrive0", + "partition": false, + "address": { + "serial": "QM00001", + "bus-type": "sata", + ... + }, + "dependents": [] + } + ] +} + +Signed-off-by: Tomáš Golembiovský +Signed-off-by: Michael Roth + +(cherry picked from commit c67d2efd9d1771fd886e3b58771adaa62897f3d9) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-win32.c | 107 ++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 101 insertions(+), 6 deletions(-) + +diff --git a/qga/commands-win32.c b/qga/commands-win32.c +index be63fa2b208..a07725e874b 100644 +--- a/qga/commands-win32.c ++++ b/qga/commands-win32.c +@@ -960,6 +960,101 @@ out: + return list; + } + ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ ERRP_GUARD(); ++ GuestDiskInfoList *new = NULL, *ret = NULL; ++ HDEVINFO dev_info; ++ SP_DEVICE_INTERFACE_DATA dev_iface_data; ++ int i; ++ ++ dev_info = SetupDiGetClassDevs(&GUID_DEVINTERFACE_DISK, 0, 0, ++ DIGCF_PRESENT | DIGCF_DEVICEINTERFACE); ++ if (dev_info == INVALID_HANDLE_VALUE) { ++ error_setg_win32(errp, GetLastError(), "failed to get device tree"); ++ return NULL; ++ } ++ ++ g_debug("enumerating devices"); ++ dev_iface_data.cbSize = sizeof(SP_DEVICE_INTERFACE_DATA); ++ for (i = 0; ++ SetupDiEnumDeviceInterfaces(dev_info, NULL, &GUID_DEVINTERFACE_DISK, ++ i, &dev_iface_data); ++ i++) { ++ GuestDiskAddress *address = NULL; ++ GuestDiskInfo *disk = NULL; ++ Error *local_err = NULL; ++ g_autofree PSP_DEVICE_INTERFACE_DETAIL_DATA ++ pdev_iface_detail_data = NULL; ++ STORAGE_DEVICE_NUMBER sdn; ++ HANDLE dev_file; ++ DWORD size = 0; ++ BOOL result; ++ int attempt; ++ ++ g_debug(" getting device path"); ++ for (attempt = 0, result = FALSE; attempt < 2 && !result; attempt++) { ++ result = SetupDiGetDeviceInterfaceDetail(dev_info, ++ &dev_iface_data, pdev_iface_detail_data, size, &size, NULL); ++ if (result) 
{ ++ break; ++ } ++ if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { ++ pdev_iface_detail_data = g_realloc(pdev_iface_detail_data, ++ size); ++ pdev_iface_detail_data->cbSize = ++ sizeof(*pdev_iface_detail_data); ++ } else { ++ g_debug("failed to get device interface details"); ++ break; ++ } ++ } ++ if (!result) { ++ g_debug("skipping device"); ++ continue; ++ } ++ ++ g_debug(" device: %s", pdev_iface_detail_data->DevicePath); ++ dev_file = CreateFile(pdev_iface_detail_data->DevicePath, 0, ++ FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL); ++ if (!DeviceIoControl(dev_file, IOCTL_STORAGE_GET_DEVICE_NUMBER, ++ NULL, 0, &sdn, sizeof(sdn), &size, NULL)) { ++ CloseHandle(dev_file); ++ debug_error("failed to get storage device number"); ++ continue; ++ } ++ CloseHandle(dev_file); ++ ++ disk = g_new0(GuestDiskInfo, 1); ++ disk->name = g_strdup_printf("\\\\.\\PhysicalDrive%lu", ++ sdn.DeviceNumber); ++ ++ g_debug(" number: %lu", sdn.DeviceNumber); ++ address = g_malloc0(sizeof(GuestDiskAddress)); ++ address->has_dev = true; ++ address->dev = g_strdup(disk->name); ++ get_single_disk_info(sdn.DeviceNumber, address, &local_err); ++ if (local_err) { ++ g_debug("failed to get disk info: %s", ++ error_get_pretty(local_err)); ++ error_free(local_err); ++ qapi_free_GuestDiskAddress(address); ++ address = NULL; ++ } else { ++ disk->address = address; ++ disk->has_address = true; ++ } ++ ++ new = g_malloc0(sizeof(GuestDiskInfoList)); ++ new->value = disk; ++ new->next = ret; ++ ret = new; ++ } ++ ++ SetupDiDestroyDeviceInfoList(dev_info); ++ return ret; ++} ++ + #else + + static GuestDiskAddressList *build_guest_disk_info(char *guid, Error **errp) +@@ -967,6 +1062,12 @@ static GuestDiskAddressList *build_guest_disk_info(char *guid, Error **errp) + return NULL; + } + ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} ++ + #endif /* CONFIG_QGA_NTDDSCSI */ + + static GuestFilesystemInfo *build_guest_fsinfo(char 
*guid, Error **errp) +@@ -2234,9 +2335,3 @@ GuestOSInfo *qmp_guest_get_osinfo(Error **errp) + + return info; + } +- +-GuestDiskInfoList *qmp_guest_get_disks(Error **errp) +-{ +- error_setg(errp, QERR_UNSUPPORTED); +- return NULL; +-} +-- +2.27.0 + diff --git a/kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch b/kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch new file mode 100755 index 0000000..dec7f7b --- /dev/null +++ b/kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch @@ -0,0 +1,176 @@ +From 7f8888f2c53060c4536856859d5ea94d23ea9e45 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:54 -0400 +Subject: [PATCH 03/14] qga: add *reset argument to ssh-add-authorized-keys +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210609100615.2501448-4-marcandre.lureau@redhat.com> +Patchwork-id: 101689 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 3/4] qga: add *reset argument to ssh-add-authorized-keys +Bugzilla: 1967716 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michal Privoznik + +From: Michael Roth + +I prefer 'reset' over 'clear', since 'clear' and keys may have some +other relations or meaning. 
+ +Signed-off-by: Marc-André Lureau +*fix disallowed g_assert* usage reported by checkpatch +Signed-off-by: Michael Roth + +(cherry picked from commit 0e3c94758e3851f0ab30d2a1e63a73284499775d) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + qga/commands-posix-ssh.c | 53 ++++++++++++++++++++++++++++++++++++---- + qga/qapi-schema.json | 3 ++- + 2 files changed, 50 insertions(+), 6 deletions(-) + +diff --git a/qga/commands-posix-ssh.c b/qga/commands-posix-ssh.c +index f74d89679c..362c9e8816 100644 +--- a/qga/commands-posix-ssh.c ++++ b/qga/commands-posix-ssh.c +@@ -168,6 +168,7 @@ read_authkeys(const char *path, Error **errp) + + void + qmp_guest_ssh_add_authorized_keys(const char *username, strList *keys, ++ bool has_reset, bool reset, + Error **errp) + { + g_autofree struct passwd *p = NULL; +@@ -178,6 +179,7 @@ qmp_guest_ssh_add_authorized_keys(const char *username, strList *keys, + size_t nkeys, nauthkeys; + + ERRP_GUARD(); ++ reset = has_reset && reset; + + if (!check_openssh_pub_keys(keys, &nkeys, errp)) { + return; +@@ -191,7 +193,9 @@ qmp_guest_ssh_add_authorized_keys(const char *username, strList *keys, + ssh_path = g_build_filename(p->pw_dir, ".ssh", NULL); + authkeys_path = g_build_filename(ssh_path, "authorized_keys", NULL); + +- authkeys = read_authkeys(authkeys_path, NULL); ++ if (!reset) { ++ authkeys = read_authkeys(authkeys_path, NULL); ++ } + if (authkeys == NULL) { + if (!g_file_test(ssh_path, G_FILE_TEST_IS_DIR) && + !mkdir_for_user(ssh_path, p, 0700, errp)) { +@@ -318,7 +322,7 @@ test_invalid_user(void) + { + Error *err = NULL; + +- qmp_guest_ssh_add_authorized_keys("", NULL, &err); ++ qmp_guest_ssh_add_authorized_keys("", NULL, FALSE, FALSE, &err); + error_free_or_abort(&err); + + qmp_guest_ssh_remove_authorized_keys("", NULL, &err); +@@ -333,7 +337,8 @@ test_invalid_key(void) + }; + Error *err = NULL; + +- qmp_guest_ssh_add_authorized_keys(g_get_user_name(), &key, &err); ++ 
qmp_guest_ssh_add_authorized_keys(g_get_user_name(), &key, ++ FALSE, FALSE, &err); + error_free_or_abort(&err); + + qmp_guest_ssh_remove_authorized_keys(g_get_user_name(), &key, &err); +@@ -346,13 +351,17 @@ test_add_keys(void) + Error *err = NULL; + + qmp_guest_ssh_add_authorized_keys(g_get_user_name(), +- (strList *)&test_key2, &err); ++ (strList *)&test_key2, ++ FALSE, FALSE, ++ &err); + g_assert(err == NULL); + + test_authorized_keys_equal("algo key2 comments"); + + qmp_guest_ssh_add_authorized_keys(g_get_user_name(), +- (strList *)&test_key1_2, &err); ++ (strList *)&test_key1_2, ++ FALSE, FALSE, ++ &err); + g_assert(err == NULL); + + /* key2 came first, and should'nt be duplicated */ +@@ -360,6 +369,39 @@ test_add_keys(void) + "algo key1 comments"); + } + ++static void ++test_add_reset_keys(void) ++{ ++ Error *err = NULL; ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)&test_key1_2, ++ FALSE, FALSE, ++ &err); ++ g_assert(err == NULL); ++ ++ /* reset with key2 only */ ++ test_authorized_keys_equal("algo key1 comments\n" ++ "algo key2 comments"); ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)&test_key2, ++ TRUE, TRUE, ++ &err); ++ g_assert(err == NULL); ++ ++ test_authorized_keys_equal("algo key2 comments"); ++ ++ /* empty should clear file */ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)NULL, ++ TRUE, TRUE, ++ &err); ++ g_assert(err == NULL); ++ ++ test_authorized_keys_equal(""); ++} ++ + static void + test_remove_keys(void) + { +@@ -393,6 +435,7 @@ int main(int argc, char *argv[]) + g_test_add_func("/qga/ssh/invalid_user", test_invalid_user); + g_test_add_func("/qga/ssh/invalid_key", test_invalid_key); + g_test_add_func("/qga/ssh/add_keys", test_add_keys); ++ g_test_add_func("/qga/ssh/add_reset_keys", test_add_reset_keys); + g_test_add_func("/qga/ssh/remove_keys", test_remove_keys); + + return g_test_run(); +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index 
3b85f5a03f..a70ea5da77 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -1279,6 +1279,7 @@ + # + # @username: the user account to add the authorized keys + # @keys: the public keys to add (in OpenSSH/sshd(8) authorized_keys format) ++# @reset: ignore the existing content, set it with the given keys only + # + # Append public keys to user .ssh/authorized_keys on Unix systems (not + # implemented for other systems). +@@ -1288,7 +1289,7 @@ + # Since: 5.2 + ## + { 'command': 'guest-ssh-add-authorized-keys', +- 'data': { 'username': 'str', 'keys': ['str'] }, ++ 'data': { 'username': 'str', 'keys': ['str'], '*reset': 'bool' }, + 'if': 'defined(CONFIG_POSIX)' } + + ## +-- +2.27.0 + diff --git a/kvm-qga-add-ssh-add-remove-authorized-keys.patch b/kvm-qga-add-ssh-add-remove-authorized-keys.patch new file mode 100755 index 0000000..b767d42 --- /dev/null +++ b/kvm-qga-add-ssh-add-remove-authorized-keys.patch @@ -0,0 +1,525 @@ +From 4be6cb23235b29d6ce450c2dacaef09c52d1aeea Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:52 -0400 +Subject: [PATCH 02/14] qga: add ssh-{add, remove}-authorized-keys +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210609100615.2501448-3-marcandre.lureau@redhat.com> +Patchwork-id: 101688 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/4] qga: add ssh-{add, remove}-authorized-keys +Bugzilla: 1967716 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michal Privoznik + +From: Marc-André Lureau + +Add new commands to add and remove SSH public keys from +~/.ssh/authorized_keys. + +I took a different approach for testing, including the unit tests right +with the code. I wanted to overwrite the function to get the user +details, I couldn't easily do that over QMP. 
Furthermore, I prefer +having unit tests very close to the code, and unit files that are domain +specific (commands-posix is too crowded already). FWIW, that +coding/testing style is Rust-style (where tests can or should even be +part of the documentation!). + +Fixes: +https://bugzilla.redhat.com/show_bug.cgi?id=1885332 + +Signed-off-by: Marc-André Lureau +Reviewed-by: Michal Privoznik +Reviewed-by: Daniel P. Berrangé +*squashed in fix-ups for setting file ownership and use of QAPI + conditionals for CONFIG_POSIX instead of stub definitions +*disable qga-ssh-test for now due to G_TEST_OPTION_ISOLATE_DIRS + triggering leak detector in build-oss-fuzz +*fix disallowed g_assert* usage reported by checkpatch +Signed-off-by: Michael Roth + +(cherry picked from commit 8d769ec777dccbff199711aba43aa6297fe4a0e0) +[ Fixes trivial backport conflicts and use Makefile.objs build-sys ] +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + qga/Makefile.objs | 2 +- + qga/commands-posix-ssh.c | 407 +++++++++++++++++++++++++++++++++++++++ + qga/qapi-schema.json | 35 ++++ + 3 files changed, 443 insertions(+), 1 deletion(-) + create mode 100644 qga/commands-posix-ssh.c + +diff --git a/qga/Makefile.objs b/qga/Makefile.objs +index 80e6bb3c2e..c8da634db0 100644 +--- a/qga/Makefile.objs ++++ b/qga/Makefile.objs +@@ -1,6 +1,6 @@ + commands-posix.o-libs := $(LIBUDEV_LIBS) + qga-obj-y = commands.o guest-agent-command-state.o main.o +-qga-obj-$(CONFIG_POSIX) += commands-posix.o channel-posix.o ++qga-obj-$(CONFIG_POSIX) += commands-posix.o channel-posix.o commands-posix-ssh.o + qga-obj-$(CONFIG_WIN32) += commands-win32.o channel-win32.o service-win32.o + qga-obj-$(CONFIG_WIN32) += vss-win32.o + qga-obj-y += qapi-generated/qga-qapi-types.o qapi-generated/qga-qapi-visit.o +diff --git a/qga/commands-posix-ssh.c b/qga/commands-posix-ssh.c +new file mode 100644 +index 0000000000..f74d89679c +--- /dev/null ++++ b/qga/commands-posix-ssh.c +@@ -0,0 +1,407 @@ ++ /* ++ * This work is 
licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++#include "qemu/osdep.h" ++ ++#include ++#include ++#include ++#include ++ ++#include "qapi/error.h" ++#include "qga-qapi-commands.h" ++ ++#ifdef QGA_BUILD_UNIT_TEST ++static struct passwd * ++test_get_passwd_entry(const gchar *user_name, GError **error) ++{ ++ struct passwd *p; ++ int ret; ++ ++ if (!user_name || g_strcmp0(user_name, g_get_user_name())) { ++ g_set_error(error, G_UNIX_ERROR, 0, "Invalid user name"); ++ return NULL; ++ } ++ ++ p = g_new0(struct passwd, 1); ++ p->pw_dir = (char *)g_get_home_dir(); ++ p->pw_uid = geteuid(); ++ p->pw_gid = getegid(); ++ ++ ret = g_mkdir_with_parents(p->pw_dir, 0700); ++ g_assert(ret == 0); ++ ++ return p; ++} ++ ++#define g_unix_get_passwd_entry_qemu(username, err) \ ++ test_get_passwd_entry(username, err) ++#endif ++ ++static struct passwd * ++get_passwd_entry(const char *username, Error **errp) ++{ ++ g_autoptr(GError) err = NULL; ++ struct passwd *p; ++ ++ ERRP_GUARD(); ++ ++ p = g_unix_get_passwd_entry_qemu(username, &err); ++ if (p == NULL) { ++ error_setg(errp, "failed to lookup user '%s': %s", ++ username, err->message); ++ return NULL; ++ } ++ ++ return p; ++} ++ ++static bool ++mkdir_for_user(const char *path, const struct passwd *p, ++ mode_t mode, Error **errp) ++{ ++ ERRP_GUARD(); ++ ++ if (g_mkdir(path, mode) == -1) { ++ error_setg(errp, "failed to create directory '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ if (chown(path, p->pw_uid, p->pw_gid) == -1) { ++ error_setg(errp, "failed to set ownership of directory '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ if (chmod(path, mode) == -1) { ++ error_setg(errp, "failed to set permissions of directory '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++check_openssh_pub_key(const char *key, Error **errp) ++{ ++ ERRP_GUARD(); ++ ++ /* simple 
sanity-check, we may want more? */ ++ if (!key || key[0] == '#' || strchr(key, '\n')) { ++ error_setg(errp, "invalid OpenSSH public key: '%s'", key); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++check_openssh_pub_keys(strList *keys, size_t *nkeys, Error **errp) ++{ ++ size_t n = 0; ++ strList *k; ++ ++ ERRP_GUARD(); ++ ++ for (k = keys; k != NULL; k = k->next) { ++ if (!check_openssh_pub_key(k->value, errp)) { ++ return false; ++ } ++ n++; ++ } ++ ++ if (nkeys) { ++ *nkeys = n; ++ } ++ return true; ++} ++ ++static bool ++write_authkeys(const char *path, const GStrv keys, ++ const struct passwd *p, Error **errp) ++{ ++ g_autofree char *contents = NULL; ++ g_autoptr(GError) err = NULL; ++ ++ ERRP_GUARD(); ++ ++ contents = g_strjoinv("\n", keys); ++ if (!g_file_set_contents(path, contents, -1, &err)) { ++ error_setg(errp, "failed to write to '%s': %s", path, err->message); ++ return false; ++ } ++ ++ if (chown(path, p->pw_uid, p->pw_gid) == -1) { ++ error_setg(errp, "failed to set ownership of directory '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ if (chmod(path, 0600) == -1) { ++ error_setg(errp, "failed to set permissions of '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static GStrv ++read_authkeys(const char *path, Error **errp) ++{ ++ g_autoptr(GError) err = NULL; ++ g_autofree char *contents = NULL; ++ ++ ERRP_GUARD(); ++ ++ if (!g_file_get_contents(path, &contents, NULL, &err)) { ++ error_setg(errp, "failed to read '%s': %s", path, err->message); ++ return NULL; ++ } ++ ++ return g_strsplit(contents, "\n", -1); ++ ++} ++ ++void ++qmp_guest_ssh_add_authorized_keys(const char *username, strList *keys, ++ Error **errp) ++{ ++ g_autofree struct passwd *p = NULL; ++ g_autofree char *ssh_path = NULL; ++ g_autofree char *authkeys_path = NULL; ++ g_auto(GStrv) authkeys = NULL; ++ strList *k; ++ size_t nkeys, nauthkeys; ++ ++ ERRP_GUARD(); ++ ++ if (!check_openssh_pub_keys(keys, &nkeys, 
errp)) { ++ return; ++ } ++ ++ p = get_passwd_entry(username, errp); ++ if (p == NULL) { ++ return; ++ } ++ ++ ssh_path = g_build_filename(p->pw_dir, ".ssh", NULL); ++ authkeys_path = g_build_filename(ssh_path, "authorized_keys", NULL); ++ ++ authkeys = read_authkeys(authkeys_path, NULL); ++ if (authkeys == NULL) { ++ if (!g_file_test(ssh_path, G_FILE_TEST_IS_DIR) && ++ !mkdir_for_user(ssh_path, p, 0700, errp)) { ++ return; ++ } ++ } ++ ++ nauthkeys = authkeys ? g_strv_length(authkeys) : 0; ++ authkeys = g_realloc_n(authkeys, nauthkeys + nkeys + 1, sizeof(char *)); ++ memset(authkeys + nauthkeys, 0, (nkeys + 1) * sizeof(char *)); ++ ++ for (k = keys; k != NULL; k = k->next) { ++ if (g_strv_contains((const gchar * const *)authkeys, k->value)) { ++ continue; ++ } ++ authkeys[nauthkeys++] = g_strdup(k->value); ++ } ++ ++ write_authkeys(authkeys_path, authkeys, p, errp); ++} ++ ++void ++qmp_guest_ssh_remove_authorized_keys(const char *username, strList *keys, ++ Error **errp) ++{ ++ g_autofree struct passwd *p = NULL; ++ g_autofree char *authkeys_path = NULL; ++ g_autofree GStrv new_keys = NULL; /* do not own the strings */ ++ g_auto(GStrv) authkeys = NULL; ++ GStrv a; ++ size_t nkeys = 0; ++ ++ ERRP_GUARD(); ++ ++ if (!check_openssh_pub_keys(keys, NULL, errp)) { ++ return; ++ } ++ ++ p = get_passwd_entry(username, errp); ++ if (p == NULL) { ++ return; ++ } ++ ++ authkeys_path = g_build_filename(p->pw_dir, ".ssh", ++ "authorized_keys", NULL); ++ if (!g_file_test(authkeys_path, G_FILE_TEST_EXISTS)) { ++ return; ++ } ++ authkeys = read_authkeys(authkeys_path, errp); ++ if (authkeys == NULL) { ++ return; ++ } ++ ++ new_keys = g_new0(char *, g_strv_length(authkeys) + 1); ++ for (a = authkeys; *a != NULL; a++) { ++ strList *k; ++ ++ for (k = keys; k != NULL; k = k->next) { ++ if (g_str_equal(k->value, *a)) { ++ break; ++ } ++ } ++ if (k != NULL) { ++ continue; ++ } ++ ++ new_keys[nkeys++] = *a; ++ } ++ ++ write_authkeys(authkeys_path, new_keys, p, errp); ++} ++ ++ ++#ifdef 
QGA_BUILD_UNIT_TEST ++#if GLIB_CHECK_VERSION(2, 60, 0) ++static const strList test_key2 = { ++ .value = (char *)"algo key2 comments" ++}; ++ ++static const strList test_key1_2 = { ++ .value = (char *)"algo key1 comments", ++ .next = (strList *)&test_key2, ++}; ++ ++static char * ++test_get_authorized_keys_path(void) ++{ ++ return g_build_filename(g_get_home_dir(), ".ssh", "authorized_keys", NULL); ++} ++ ++static void ++test_authorized_keys_set(const char *contents) ++{ ++ g_autoptr(GError) err = NULL; ++ g_autofree char *path = NULL; ++ int ret; ++ ++ path = g_build_filename(g_get_home_dir(), ".ssh", NULL); ++ ret = g_mkdir_with_parents(path, 0700); ++ g_assert(ret == 0); ++ g_free(path); ++ ++ path = test_get_authorized_keys_path(); ++ g_file_set_contents(path, contents, -1, &err); ++ g_assert(err == NULL); ++} ++ ++static void ++test_authorized_keys_equal(const char *expected) ++{ ++ g_autoptr(GError) err = NULL; ++ g_autofree char *path = NULL; ++ g_autofree char *contents = NULL; ++ ++ path = test_get_authorized_keys_path(); ++ g_file_get_contents(path, &contents, NULL, &err); ++ g_assert(err == NULL); ++ ++ g_assert(g_strcmp0(contents, expected) == 0); ++} ++ ++static void ++test_invalid_user(void) ++{ ++ Error *err = NULL; ++ ++ qmp_guest_ssh_add_authorized_keys("", NULL, &err); ++ error_free_or_abort(&err); ++ ++ qmp_guest_ssh_remove_authorized_keys("", NULL, &err); ++ error_free_or_abort(&err); ++} ++ ++static void ++test_invalid_key(void) ++{ ++ strList key = { ++ .value = (char *)"not a valid\nkey" ++ }; ++ Error *err = NULL; ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), &key, &err); ++ error_free_or_abort(&err); ++ ++ qmp_guest_ssh_remove_authorized_keys(g_get_user_name(), &key, &err); ++ error_free_or_abort(&err); ++} ++ ++static void ++test_add_keys(void) ++{ ++ Error *err = NULL; ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)&test_key2, &err); ++ g_assert(err == NULL); ++ ++ test_authorized_keys_equal("algo 
key2 comments"); ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)&test_key1_2, &err); ++ g_assert(err == NULL); ++ ++ /* key2 came first, and should'nt be duplicated */ ++ test_authorized_keys_equal("algo key2 comments\n" ++ "algo key1 comments"); ++} ++ ++static void ++test_remove_keys(void) ++{ ++ Error *err = NULL; ++ static const char *authkeys = ++ "algo key1 comments\n" ++ /* originally duplicated */ ++ "algo key1 comments\n" ++ "# a commented line\n" ++ "algo some-key another\n"; ++ ++ test_authorized_keys_set(authkeys); ++ qmp_guest_ssh_remove_authorized_keys(g_get_user_name(), ++ (strList *)&test_key2, &err); ++ g_assert(err == NULL); ++ test_authorized_keys_equal(authkeys); ++ ++ qmp_guest_ssh_remove_authorized_keys(g_get_user_name(), ++ (strList *)&test_key1_2, &err); ++ g_assert(err == NULL); ++ test_authorized_keys_equal("# a commented line\n" ++ "algo some-key another\n"); ++} ++ ++int main(int argc, char *argv[]) ++{ ++ setlocale(LC_ALL, ""); ++ ++ g_test_init(&argc, &argv, G_TEST_OPTION_ISOLATE_DIRS, NULL); ++ ++ g_test_add_func("/qga/ssh/invalid_user", test_invalid_user); ++ g_test_add_func("/qga/ssh/invalid_key", test_invalid_key); ++ g_test_add_func("/qga/ssh/add_keys", test_add_keys); ++ g_test_add_func("/qga/ssh/remove_keys", test_remove_keys); ++ ++ return g_test_run(); ++} ++#else ++int main(int argc, char *argv[]) ++{ ++ g_test_message("test skipped, needs glib >= 2.60"); ++ return 0; ++} ++#endif /* GLIB_2_60 */ ++#endif /* BUILD_UNIT_TEST */ +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index 4222cb92d3..3b85f5a03f 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -1273,3 +1273,38 @@ + ## + { 'command': 'guest-get-osinfo', + 'returns': 'GuestOSInfo' } ++ ++## ++# @guest-ssh-add-authorized-keys: ++# ++# @username: the user account to add the authorized keys ++# @keys: the public keys to add (in OpenSSH/sshd(8) authorized_keys format) ++# ++# Append public keys to user 
.ssh/authorized_keys on Unix systems (not ++# implemented for other systems). ++# ++# Returns: Nothing on success. ++# ++# Since: 5.2 ++## ++{ 'command': 'guest-ssh-add-authorized-keys', ++ 'data': { 'username': 'str', 'keys': ['str'] }, ++ 'if': 'defined(CONFIG_POSIX)' } ++ ++## ++# @guest-ssh-remove-authorized-keys: ++# ++# @username: the user account to remove the authorized keys ++# @keys: the public keys to remove (in OpenSSH/sshd(8) authorized_keys format) ++# ++# Remove public keys from the user .ssh/authorized_keys on Unix systems (not ++# implemented for other systems). It's not an error if the key is already ++# missing. ++# ++# Returns: Nothing on success. ++# ++# Since: 5.2 ++## ++{ 'command': 'guest-ssh-remove-authorized-keys', ++ 'data': { 'username': 'str', 'keys': ['str'] }, ++ 'if': 'defined(CONFIG_POSIX)' } +-- +2.27.0 + diff --git a/kvm-qga-add-ssh-get-authorized-keys.patch b/kvm-qga-add-ssh-get-authorized-keys.patch new file mode 100755 index 0000000..2b4c377 --- /dev/null +++ b/kvm-qga-add-ssh-get-authorized-keys.patch @@ -0,0 +1,170 @@ +From 1ed102f5489e6cf3168d9014e9a082909193b6fc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:57 -0400 +Subject: [PATCH 04/14] qga: add ssh-get-authorized-keys +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210609100615.2501448-5-marcandre.lureau@redhat.com> +Patchwork-id: 101690 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 4/4] qga: add ssh-get-authorized-keys +Bugzilla: 1967716 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michal Privoznik + +From: Marc-André Lureau + +Signed-off-by: Marc-André Lureau +*fix-up merge conflicts due to qga-ssh-test being disabled in earlier + patch due to G_TEST_OPTION_ISOLATE_DIRS triggering build-oss-fuzz + leak detector. 
+*fix up style and disallowed g_assert* usage reported by checkpatch +Signed-off-by: Michael Roth + +(cherry picked from commit cad97c08a1c17830d77a46780088bc0199df89d1) +[ Fix trivial schema conflict ] +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + qga/commands-posix-ssh.c | 66 ++++++++++++++++++++++++++++++++++++++++ + qga/qapi-schema.json | 30 ++++++++++++++++++ + 2 files changed, 96 insertions(+) + +diff --git a/qga/commands-posix-ssh.c b/qga/commands-posix-ssh.c +index 362c9e8816..749167e82d 100644 +--- a/qga/commands-posix-ssh.c ++++ b/qga/commands-posix-ssh.c +@@ -268,6 +268,46 @@ qmp_guest_ssh_remove_authorized_keys(const char *username, strList *keys, + write_authkeys(authkeys_path, new_keys, p, errp); + } + ++GuestAuthorizedKeys * ++qmp_guest_ssh_get_authorized_keys(const char *username, Error **errp) ++{ ++ g_autofree struct passwd *p = NULL; ++ g_autofree char *authkeys_path = NULL; ++ g_auto(GStrv) authkeys = NULL; ++ g_autoptr(GuestAuthorizedKeys) ret = NULL; ++ int i; ++ ++ ERRP_GUARD(); ++ ++ p = get_passwd_entry(username, errp); ++ if (p == NULL) { ++ return NULL; ++ } ++ ++ authkeys_path = g_build_filename(p->pw_dir, ".ssh", ++ "authorized_keys", NULL); ++ authkeys = read_authkeys(authkeys_path, errp); ++ if (authkeys == NULL) { ++ return NULL; ++ } ++ ++ ret = g_new0(GuestAuthorizedKeys, 1); ++ for (i = 0; authkeys[i] != NULL; i++) { ++ strList *new; ++ ++ g_strstrip(authkeys[i]); ++ if (!authkeys[i][0] || authkeys[i][0] == '#') { ++ continue; ++ } ++ ++ new = g_new0(strList, 1); ++ new->value = g_strdup(authkeys[i]); ++ new->next = ret->keys; ++ ret->keys = new; ++ } ++ ++ return g_steal_pointer(&ret); ++} + + #ifdef QGA_BUILD_UNIT_TEST + #if GLIB_CHECK_VERSION(2, 60, 0) +@@ -426,6 +466,31 @@ test_remove_keys(void) + "algo some-key another\n"); + } + ++static void ++test_get_keys(void) ++{ ++ Error *err = NULL; ++ static const char *authkeys = ++ "algo key1 comments\n" ++ "# a commented line\n" ++ "algo some-key 
another\n"; ++ g_autoptr(GuestAuthorizedKeys) ret = NULL; ++ strList *k; ++ size_t len = 0; ++ ++ test_authorized_keys_set(authkeys); ++ ++ ret = qmp_guest_ssh_get_authorized_keys(g_get_user_name(), &err); ++ g_assert(err == NULL); ++ ++ for (len = 0, k = ret->keys; k != NULL; k = k->next) { ++ g_assert(g_str_has_prefix(k->value, "algo ")); ++ len++; ++ } ++ ++ g_assert(len == 2); ++} ++ + int main(int argc, char *argv[]) + { + setlocale(LC_ALL, ""); +@@ -437,6 +502,7 @@ int main(int argc, char *argv[]) + g_test_add_func("/qga/ssh/add_keys", test_add_keys); + g_test_add_func("/qga/ssh/add_reset_keys", test_add_reset_keys); + g_test_add_func("/qga/ssh/remove_keys", test_remove_keys); ++ g_test_add_func("/qga/ssh/get_keys", test_get_keys); + + return g_test_run(); + } +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index a70ea5da77..97bf96712e 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -1274,6 +1274,36 @@ + { 'command': 'guest-get-osinfo', + 'returns': 'GuestOSInfo' } + ++## ++# @GuestAuthorizedKeys: ++# ++# @keys: public keys (in OpenSSH/sshd(8) authorized_keys format) ++# ++# Since: 5.2 ++## ++{ 'struct': 'GuestAuthorizedKeys', ++ 'data': { ++ 'keys': ['str'] ++ }, ++ 'if': 'defined(CONFIG_POSIX)' } ++ ++## ++# @guest-ssh-get-authorized-keys: ++# ++# @username: the user account to add the authorized keys ++# ++# Return the public keys from user .ssh/authorized_keys on Unix systems (not ++# implemented for other systems). 
++# ++# Returns: @GuestAuthorizedKeys ++# ++# Since: 5.2 ++## ++{ 'command': 'guest-ssh-get-authorized-keys', ++ 'data': { 'username': 'str' }, ++ 'returns': 'GuestAuthorizedKeys', ++ 'if': 'defined(CONFIG_POSIX)' } ++ + ## + # @guest-ssh-add-authorized-keys: + # +-- +2.27.0 + diff --git a/kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch b/kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch new file mode 100755 index 0000000..0aa2440 --- /dev/null +++ b/kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch @@ -0,0 +1,140 @@ +From 3a63e2d29bb2fd92577d42aeb8fa956ae18df22e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 2 Oct 2020 10:17:41 -0400 +Subject: [PATCH 02/14] qga/commands-posix: Move the udev code from the pci to + the generic function +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20201002101742.249169-3-thuth@redhat.com> +Patchwork-id: 98526 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/3] qga/commands-posix: Move the udev code from the pci to the generic function +Bugzilla: 1755075 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +The libudev-related code is independent from the other pci-related code +and can be re-used for non-pci devices (like ccw devices on s390x). Thus +move this part to the generic function. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1755075 +Signed-off-by: Thomas Huth +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Michael Roth +(cherry picked from commit 43dadc431bacbc5a5baee7e256288a98a3e95ce3) +Signed-off-by: Danilo C. L. 
de Paula +--- + qga/commands-posix.c | 62 +++++++++++++++++++++++--------------------- + 1 file changed, 33 insertions(+), 29 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 99d6b1c8c1..6db76aadd1 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -878,10 +878,6 @@ static bool build_guest_fsinfo_for_pci_dev(char const *syspath, + GuestPCIAddress *pciaddr = disk->pci_controller; + bool has_ata = false, has_host = false, has_tgt = false; + char *p, *q, *driver = NULL; +-#ifdef CONFIG_LIBUDEV +- struct udev *udev = NULL; +- struct udev_device *udevice = NULL; +-#endif + bool ret = false; + + p = strstr(syspath, "/devices/pci"); +@@ -940,26 +936,6 @@ static bool build_guest_fsinfo_for_pci_dev(char const *syspath, + pciaddr->slot = pci[2]; + pciaddr->function = pci[3]; + +-#ifdef CONFIG_LIBUDEV +- udev = udev_new(); +- udevice = udev_device_new_from_syspath(udev, syspath); +- if (udev == NULL || udevice == NULL) { +- g_debug("failed to query udev"); +- } else { +- const char *devnode, *serial; +- devnode = udev_device_get_devnode(udevice); +- if (devnode != NULL) { +- disk->dev = g_strdup(devnode); +- disk->has_dev = true; +- } +- serial = udev_device_get_property_value(udevice, "ID_SERIAL"); +- if (serial != NULL && *serial != 0) { +- disk->serial = g_strdup(serial); +- disk->has_serial = true; +- } +- } +-#endif +- + if (strcmp(driver, "ata_piix") == 0) { + /* a host per ide bus, target*:0::0 */ + if (!has_host || !has_tgt) { +@@ -1021,10 +997,6 @@ static bool build_guest_fsinfo_for_pci_dev(char const *syspath, + + cleanup: + g_free(driver); +-#ifdef CONFIG_LIBUDEV +- udev_unref(udev); +- udev_device_unref(udevice); +-#endif + return ret; + } + +@@ -1037,18 +1009,50 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + GuestPCIAddress *pciaddr; + GuestDiskAddressList *list = NULL; + bool has_hwinf; ++#ifdef CONFIG_LIBUDEV ++ struct udev *udev = NULL; ++ struct udev_device *udevice = NULL; ++#endif + + 
pciaddr = g_new0(GuestPCIAddress, 1); ++ pciaddr->domain = -1; /* -1 means field is invalid */ ++ pciaddr->bus = -1; ++ pciaddr->slot = -1; ++ pciaddr->function = -1; + + disk = g_new0(GuestDiskAddress, 1); + disk->pci_controller = pciaddr; ++ disk->bus_type = GUEST_DISK_BUS_TYPE_UNKNOWN; + + list = g_new0(GuestDiskAddressList, 1); + list->value = disk; + ++#ifdef CONFIG_LIBUDEV ++ udev = udev_new(); ++ udevice = udev_device_new_from_syspath(udev, syspath); ++ if (udev == NULL || udevice == NULL) { ++ g_debug("failed to query udev"); ++ } else { ++ const char *devnode, *serial; ++ devnode = udev_device_get_devnode(udevice); ++ if (devnode != NULL) { ++ disk->dev = g_strdup(devnode); ++ disk->has_dev = true; ++ } ++ serial = udev_device_get_property_value(udevice, "ID_SERIAL"); ++ if (serial != NULL && *serial != 0) { ++ disk->serial = g_strdup(serial); ++ disk->has_serial = true; ++ } ++ } ++ ++ udev_unref(udev); ++ udev_device_unref(udevice); ++#endif ++ + has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); + +- if (has_hwinf) { ++ if (has_hwinf || disk->has_dev || disk->has_serial) { + list->next = fs->disk; + fs->disk = list; + } else { +-- +2.27.0 + diff --git a/kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch b/kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch new file mode 100755 index 0000000..9915334 --- /dev/null +++ b/kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch @@ -0,0 +1,156 @@ +From 84bc86fdf47729bca77957a04161862ffbedbf2f Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 2 Oct 2020 10:17:40 -0400 +Subject: [PATCH 01/14] qga/commands-posix: Rework + build_guest_fsinfo_for_real_device() function +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Message-id: <20201002101742.249169-2-thuth@redhat.com> +Patchwork-id: 98527 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/3] qga/commands-posix: Rework build_guest_fsinfo_for_real_device() function +Bugzilla: 
1755075 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +We are going to support non-PCI devices soon. For this we need to split +the generic GuestDiskAddress and GuestDiskAddressList memory allocation +and list chaining into a separate function first. + +Signed-off-by: Thomas Huth +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Michael Roth +(cherry picked from commit d9fe4f0fea31f0560dc40d3576bc6c48ad97109f) +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 65 ++++++++++++++++++++++++++++---------------- + 1 file changed, 41 insertions(+), 24 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 1c1a165dae..99d6b1c8c1 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -865,28 +865,30 @@ static int build_hosts(char const *syspath, char const *host, bool ata, + return i; + } + +-/* Store disk device info specified by @sysfs into @fs */ +-static void build_guest_fsinfo_for_real_device(char const *syspath, +- GuestFilesystemInfo *fs, +- Error **errp) ++/* ++ * Store disk device info for devices on the PCI bus. ++ * Returns true if information has been stored, or false for failure. 
++ */ ++static bool build_guest_fsinfo_for_pci_dev(char const *syspath, ++ GuestDiskAddress *disk, ++ Error **errp) + { + unsigned int pci[4], host, hosts[8], tgt[3]; + int i, nhosts = 0, pcilen; +- GuestDiskAddress *disk; +- GuestPCIAddress *pciaddr; +- GuestDiskAddressList *list = NULL; ++ GuestPCIAddress *pciaddr = disk->pci_controller; + bool has_ata = false, has_host = false, has_tgt = false; + char *p, *q, *driver = NULL; + #ifdef CONFIG_LIBUDEV + struct udev *udev = NULL; + struct udev_device *udevice = NULL; + #endif ++ bool ret = false; + + p = strstr(syspath, "/devices/pci"); + if (!p || sscanf(p + 12, "%*x:%*x/%x:%x:%x.%x%n", + pci, pci + 1, pci + 2, pci + 3, &pcilen) < 4) { + g_debug("only pci device is supported: sysfs path '%s'", syspath); +- return; ++ return false; + } + + p += 12 + pcilen; +@@ -907,7 +909,7 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + } + + g_debug("unsupported driver or sysfs path '%s'", syspath); +- return; ++ return false; + } + + p = strstr(syspath, "/target"); +@@ -933,18 +935,11 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + } + } + +- pciaddr = g_malloc0(sizeof(*pciaddr)); + pciaddr->domain = pci[0]; + pciaddr->bus = pci[1]; + pciaddr->slot = pci[2]; + pciaddr->function = pci[3]; + +- disk = g_malloc0(sizeof(*disk)); +- disk->pci_controller = pciaddr; +- +- list = g_malloc0(sizeof(*list)); +- list->value = disk; +- + #ifdef CONFIG_LIBUDEV + udev = udev_new(); + udevice = udev_device_new_from_syspath(udev, syspath); +@@ -1022,21 +1017,43 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + goto cleanup; + } + +- list->next = fs->disk; +- fs->disk = list; +- goto out; ++ ret = true; + + cleanup: +- if (list) { +- qapi_free_GuestDiskAddressList(list); +- } +-out: + g_free(driver); + #ifdef CONFIG_LIBUDEV + udev_unref(udev); + udev_device_unref(udevice); + #endif +- return; ++ return ret; ++} ++ ++/* Store disk device info specified by @sysfs into @fs */ 
++static void build_guest_fsinfo_for_real_device(char const *syspath, ++ GuestFilesystemInfo *fs, ++ Error **errp) ++{ ++ GuestDiskAddress *disk; ++ GuestPCIAddress *pciaddr; ++ GuestDiskAddressList *list = NULL; ++ bool has_hwinf; ++ ++ pciaddr = g_new0(GuestPCIAddress, 1); ++ ++ disk = g_new0(GuestDiskAddress, 1); ++ disk->pci_controller = pciaddr; ++ ++ list = g_new0(GuestDiskAddressList, 1); ++ list->value = disk; ++ ++ has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); ++ ++ if (has_hwinf) { ++ list->next = fs->disk; ++ fs->disk = list; ++ } else { ++ qapi_free_GuestDiskAddressList(list); ++ } + } + + static void build_guest_fsinfo_for_device(char const *devpath, +-- +2.27.0 + diff --git a/kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch b/kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch new file mode 100755 index 0000000..0d37a64 --- /dev/null +++ b/kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch @@ -0,0 +1,94 @@ +From 250227a53c1d43d2bd8346922edb3452f3534be6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 2 Oct 2020 10:17:42 -0400 +Subject: [PATCH 03/14] qga/commands-posix: Support fsinfo for non-PCI virtio + devices, too +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20201002101742.249169-4-thuth@redhat.com> +Patchwork-id: 98528 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/3] qga/commands-posix: Support fsinfo for non-PCI virtio devices, too +Bugzilla: 1755075 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +QEMU on s390x uses virtio via channel I/O instead of PCI by default. +Add a function to detect and provide information for virtio-scsi and +virtio-block devices here, too. + +Signed-off-by: Thomas Huth +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Michael Roth +(cherry picked from commit 23843c129d5e1ca360605e511a43a34faebb47c4) +Signed-off-by: Danilo C. L. 
de Paula +--- + qga/commands-posix.c | 42 +++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 41 insertions(+), 1 deletion(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 6db76aadd1..c86c87ed52 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -1000,6 +1000,39 @@ cleanup: + return ret; + } + ++/* ++ * Store disk device info for non-PCI virtio devices (for example s390x ++ * channel I/O devices). Returns true if information has been stored, or ++ * false for failure. ++ */ ++static bool build_guest_fsinfo_for_nonpci_virtio(char const *syspath, ++ GuestDiskAddress *disk, ++ Error **errp) ++{ ++ unsigned int tgt[3]; ++ char *p; ++ ++ if (!strstr(syspath, "/virtio") || !strstr(syspath, "/block")) { ++ g_debug("Unsupported virtio device '%s'", syspath); ++ return false; ++ } ++ ++ p = strstr(syspath, "/target"); ++ if (p && sscanf(p + 7, "%*u:%*u:%*u/%*u:%u:%u:%u", ++ &tgt[0], &tgt[1], &tgt[2]) == 3) { ++ /* virtio-scsi: target*:0:: */ ++ disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI; ++ disk->bus = tgt[0]; ++ disk->target = tgt[1]; ++ disk->unit = tgt[2]; ++ } else { ++ /* virtio-blk: 1 disk per 1 device */ ++ disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO; ++ } ++ ++ return true; ++} ++ + /* Store disk device info specified by @sysfs into @fs */ + static void build_guest_fsinfo_for_real_device(char const *syspath, + GuestFilesystemInfo *fs, +@@ -1050,7 +1083,14 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + udev_device_unref(udevice); + #endif + +- has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); ++ if (strstr(syspath, "/devices/pci")) { ++ has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); ++ } else if (strstr(syspath, "/virtio")) { ++ has_hwinf = build_guest_fsinfo_for_nonpci_virtio(syspath, disk, errp); ++ } else { ++ g_debug("Unsupported device type for '%s'", syspath); ++ has_hwinf = false; ++ } + + if (has_hwinf || disk->has_dev || disk->has_serial) { + list->next = 
fs->disk; +-- +2.27.0 + diff --git a/kvm-qga-fix-assert-regression-on-guest-shutdown.patch b/kvm-qga-fix-assert-regression-on-guest-shutdown.patch new file mode 100755 index 0000000..7db6e1f --- /dev/null +++ b/kvm-qga-fix-assert-regression-on-guest-shutdown.patch @@ -0,0 +1,61 @@ +From 93b37bad75d14ed4b9e96cc3587d8ae16cb96ba3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Fri, 2 Oct 2020 17:46:08 -0400 +Subject: [PATCH 01/18] qga: fix assert regression on guest-shutdown +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201002174608.943992-2-marcandre.lureau@redhat.com> +Patchwork-id: 98534 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] qga: fix assert regression on guest-shutdown +Bugzilla: 1884531 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Marc-André Lureau + +Since commit 781f2b3d1e ("qga: process_event() simplification"), +send_response() is called unconditionally, but will assert when "rsp" is +NULL. This may happen with QCO_NO_SUCCESS_RESP commands, such as +"guest-shutdown". + +Fixes: 781f2b3d1e5ef389b44016a897fd55e7a780bf35 +Cc: Michael Roth +Reported-by: Christian Ehrhardt +Signed-off-by: Marc-André Lureau +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Christian Ehrhardt +Tested-by: Christian Ehrhardt +Cc: qemu-stable@nongnu.org +Signed-off-by: Michael Roth + +(cherry picked from commit 844bd70b5652f30bbace89499f513e3fbbb6457a) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. 
de Paula +--- + qga/main.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/qga/main.c b/qga/main.c +index c35c2a21209..12fa463f4cd 100644 +--- a/qga/main.c ++++ b/qga/main.c +@@ -529,7 +529,11 @@ static int send_response(GAState *s, const QDict *rsp) + QString *payload_qstr, *response_qstr; + GIOStatus status; + +- g_assert(rsp && s->channel); ++ g_assert(s->channel); ++ ++ if (!rsp) { ++ return 0; ++ } + + payload_qstr = qobject_to_json(QOBJECT(rsp)); + if (!payload_qstr) { +-- +2.27.0 + diff --git a/kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch b/kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch new file mode 100755 index 0000000..6ffc5bd --- /dev/null +++ b/kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch @@ -0,0 +1,54 @@ +From c9b1eb9d6c0da9098d5410d90d290d6fca6ea7dc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:14 -0500 +Subject: [PATCH 13/14] qga: fix missing closedir() in qmp_guest_get_disks() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-10-marcandre.lureau@redhat.com> +Patchwork-id: 100481 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 09/10] qga: fix missing closedir() in qmp_guest_get_disks() +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Michael Roth + +We opendir("/sys/block") at the beginning of the function, but we never +close it prior to returning. + +Fixes: Coverity CID 1436130 +Fixes: fed3956429d5 ("qga: add implementation of guest-get-disks for Linux") +Reported-by: Peter Maydell +Cc: Marc-André Lureau +Cc: Tomáš Golembiovský +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Michael Roth + +(cherry-picked from commit b1b9ab1c04d560f86d8da3dfca4d8b21de75fee6) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. 
de Paula +--- + qga/commands-posix.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 96f5ddafd3a..9a170dee14c 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -1445,6 +1445,9 @@ GuestDiskInfoList *qmp_guest_get_disks(Error **errp) + get_disk_deps(disk_dir, disk); + ret = get_disk_partitions(ret, de->d_name, disk_dir, dev_name); + } ++ ++ closedir(dp); ++ + return ret; + } + +-- +2.27.0 + diff --git a/kvm-qga-rename-Error-parameter-to-more-common-errp.patch b/kvm-qga-rename-Error-parameter-to-more-common-errp.patch new file mode 100755 index 0000000..2528d26 --- /dev/null +++ b/kvm-qga-rename-Error-parameter-to-more-common-errp.patch @@ -0,0 +1,121 @@ +From 457ba062cc1026a88a70ab3cb9a52acd62c5a2a8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 24 Dec 2020 12:53:02 -0500 +Subject: [PATCH 2/5] qga: rename Error ** parameter to more common errp +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201224125304.62697-2-marcandre.lureau@redhat.com> +Patchwork-id: 100498 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/3] qga: rename Error ** parameter to more common errp +Bugzilla: 1910326 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé + +From: Vladimir Sementsov-Ogievskiy + +Signed-off-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20191205174635.18758-13-vsementsov@virtuozzo.com> +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Markus Armbruster +Signed-off-by: Markus Armbruster + +(cherry picked from commit b90abbac0b95f68a7ebac5545ab77b98f598a9c7) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. 
de Paula +--- + qga/commands-posix.c | 2 +- + qga/commands-win32.c | 2 +- + qga/commands.c | 12 ++++++------ + 3 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index c02373cdf7d..29353e90c8f 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -3134,7 +3134,7 @@ static double ga_get_login_time(struct utmpx *user_info) + return seconds + useconds; + } + +-GuestUserList *qmp_guest_get_users(Error **err) ++GuestUserList *qmp_guest_get_users(Error **errp) + { + GHashTable *cache = NULL; + GuestUserList *head = NULL, *cur_item = NULL; +diff --git a/qga/commands-win32.c b/qga/commands-win32.c +index a07725e874b..618ccdfadaa 100644 +--- a/qga/commands-win32.c ++++ b/qga/commands-win32.c +@@ -2047,7 +2047,7 @@ typedef struct _GA_WTSINFOA { + + } GA_WTSINFOA; + +-GuestUserList *qmp_guest_get_users(Error **err) ++GuestUserList *qmp_guest_get_users(Error **errp) + { + #define QGA_NANOSECONDS 10000000 + +diff --git a/qga/commands.c b/qga/commands.c +index 0c7d1385c23..43c323ceada 100644 +--- a/qga/commands.c ++++ b/qga/commands.c +@@ -143,7 +143,7 @@ static GuestExecInfo *guest_exec_info_find(int64_t pid_numeric) + return NULL; + } + +-GuestExecStatus *qmp_guest_exec_status(int64_t pid, Error **err) ++GuestExecStatus *qmp_guest_exec_status(int64_t pid, Error **errp) + { + GuestExecInfo *gei; + GuestExecStatus *ges; +@@ -152,7 +152,7 @@ GuestExecStatus *qmp_guest_exec_status(int64_t pid, Error **err) + + gei = guest_exec_info_find(pid); + if (gei == NULL) { +- error_setg(err, QERR_INVALID_PARAMETER, "pid"); ++ error_setg(errp, QERR_INVALID_PARAMETER, "pid"); + return NULL; + } + +@@ -385,7 +385,7 @@ GuestExec *qmp_guest_exec(const char *path, + bool has_env, strList *env, + bool has_input_data, const char *input_data, + bool has_capture_output, bool capture_output, +- Error **err) ++ Error **errp) + { + GPid pid; + GuestExec *ge = NULL; +@@ -405,7 +405,7 @@ GuestExec *qmp_guest_exec(const char *path, + 
arglist.next = has_arg ? arg : NULL; + + if (has_input_data) { +- input = qbase64_decode(input_data, -1, &ninput, err); ++ input = qbase64_decode(input_data, -1, &ninput, errp); + if (!input) { + return NULL; + } +@@ -424,7 +424,7 @@ GuestExec *qmp_guest_exec(const char *path, + guest_exec_task_setup, NULL, &pid, has_input_data ? &in_fd : NULL, + has_output ? &out_fd : NULL, has_output ? &err_fd : NULL, &gerr); + if (!ret) { +- error_setg(err, QERR_QGA_COMMAND_FAILED, gerr->message); ++ error_setg(errp, QERR_QGA_COMMAND_FAILED, gerr->message); + g_error_free(gerr); + goto done; + } +@@ -499,7 +499,7 @@ int ga_parse_whence(GuestFileWhence *whence, Error **errp) + return -1; + } + +-GuestHostName *qmp_guest_get_host_name(Error **err) ++GuestHostName *qmp_guest_get_host_name(Error **errp) + { + GuestHostName *result = NULL; + gchar const *hostname = g_get_host_name(); +-- +2.27.0 + diff --git a/kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch b/kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch new file mode 100755 index 0000000..727015e --- /dev/null +++ b/kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch @@ -0,0 +1,113 @@ +From ff881d64d3f29825ab093eb2be183658226ccba3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:15 -0500 +Subject: [PATCH 14/14] qga: update schema for guest-get-disks 'dependents' + field +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-11-marcandre.lureau@redhat.com> +Patchwork-id: 100480 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 10/10] qga: update schema for guest-get-disks 'dependents' field +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Michael Roth + +The recently-added 'guest-get-disk' command returns a list of +GuestDiskInfo entries, which in turn have a 'dependents' 
field which +lists devices these entries are dependent upon. Thus, 'dependencies' +is a better name for this field. Address this by renaming the field +accordingly. + +Additionally, 'dependents' is specified as non-optional, even though +it's not implemented for w32. This is misleading, since it gives users +the impression that a particular disk might not have dependencies, +when in reality that information is simply not known to the guest +agent. Address this by making 'dependents' an optional field, and only +marking it as in-use when the facilities to obtain this information are +available to the guest agent. + +Cc: Eric Blake +Cc: Tomáš Golembiovský +Cc: Marc-André Lureau +Reviewed-by: Eric Blake +Reviewed-by: Marc-André Lureau +Signed-off-by: Michael Roth + +(cherry-picked from commit a8aa94b5f8427cc2924d8cdd417c8014db1c86c0) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 10 ++++++---- + qga/qapi-schema.json | 8 ++++---- + 2 files changed, 10 insertions(+), 8 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 9a170dee14c..c02373cdf7d 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -1287,6 +1287,7 @@ static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk) + g_debug("failed to list entries in %s", deps_dir); + return; + } ++ disk->has_dependencies = true; + while ((dep = g_dir_read_name(dp_deps)) != NULL) { + g_autofree char *dep_dir = NULL; + strList *dep_item = NULL; +@@ -1299,8 +1300,8 @@ static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk) + g_debug(" adding dependent device: %s", dev_name); + dep_item = g_new0(strList, 1); + dep_item->value = dev_name; +- dep_item->next = disk->dependents; +- disk->dependents = dep_item; ++ dep_item->next = disk->dependencies; ++ disk->dependencies = dep_item; + } + } + g_dir_close(dp_deps); +@@ -1353,8 +1354,9 @@ static GuestDiskInfoList *get_disk_partitions( + partition->name = dev_name; + 
partition->partition = true; + /* Add parent disk as dependent for easier tracking of hierarchy */ +- partition->dependents = g_new0(strList, 1); +- partition->dependents->value = g_strdup(disk_dev); ++ partition->dependencies = g_new0(strList, 1); ++ partition->dependencies->value = g_strdup(disk_dev); ++ partition->has_dependencies = true; + + item = g_new0(GuestDiskInfoList, 1); + item->value = partition; +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index 22df375c92f..4222cb92d34 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -857,9 +857,9 @@ + # + # @name: device node (Linux) or device UNC (Windows) + # @partition: whether this is a partition or disk +-# @dependents: list of dependent devices; e.g. for LVs of the LVM this will +-# hold the list of PVs, for LUKS encrypted volume this will +-# contain the disk where the volume is placed. (Linux) ++# @dependencies: list of device dependencies; e.g. for LVs of the LVM this will ++# hold the list of PVs, for LUKS encrypted volume this will ++# contain the disk where the volume is placed. 
(Linux) + # @address: disk address information (only for non-virtual devices) + # @alias: optional alias assigned to the disk, on Linux this is a name assigned + # by device mapper +@@ -867,7 +867,7 @@ + # Since 5.2 + ## + { 'struct': 'GuestDiskInfo', +- 'data': {'name': 'str', 'partition': 'bool', 'dependents': ['str'], ++ 'data': {'name': 'str', 'partition': 'bool', '*dependencies': ['str'], + '*address': 'GuestDiskAddress', '*alias': 'str'} } + + ## +-- +2.27.0 + diff --git a/kvm-raw-format-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch b/kvm-raw-format-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch new file mode 100755 index 0000000..5384b51 --- /dev/null +++ b/kvm-raw-format-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch @@ -0,0 +1,55 @@ +From 5d590d354e42515ea074bf2110a2ab236dbabba1 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:34 +0100 +Subject: [PATCH 06/17] raw-format: Support BDRV_REQ_ZERO_WRITE for truncate + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-6-kwolf@redhat.com> +Patchwork-id: 97447 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 05/11] raw-format: Support BDRV_REQ_ZERO_WRITE for truncate +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +The raw format driver can simply forward the flag and let its bs->file +child take care of actually providing the zeros. + +Signed-off-by: Kevin Wolf +Reviewed-by: Max Reitz +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200424125448.63318-6-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 1ddaabaecb7eaeb6d8948a32340af95db44c54a1) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/raw-format.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/block/raw-format.c b/block/raw-format.c +index c3acf9a..bdec466 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -387,7 +387,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + + s->size = offset; + offset += s->offset; +- return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp); ++ return bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); + } + + static void raw_eject(BlockDriverState *bs, bool eject_flag) +@@ -445,6 +445,8 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & + bs->file->bs->supported_zero_flags); ++ bs->supported_truncate_flags = bs->file->bs->supported_truncate_flags & ++ BDRV_REQ_ZERO_WRITE; + + if (bs->probed && !bdrv_is_read_only(bs)) { + bdrv_refresh_filename(bs->file->bs); +-- +1.8.3.1 + diff --git a/kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch b/kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch new file mode 100755 index 0000000..55be349 --- /dev/null +++ b/kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch @@ -0,0 +1,72 @@ +From b07219611480dd4a37b2476604a1cec35c812216 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 23 Dec 2020 12:29:24 -0500 +Subject: [PATCH 1/5] redhat: link /etc/qemu-ga/fsfreeze-hook to /etc/qemu-kvm/ +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201223122924.341944-1-marcandre.lureau@redhat.com> +Patchwork-id: 100496 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH] redhat: link /etc/qemu-ga/fsfreeze-hook to /etc/qemu-kvm/ +Bugzilla: 1910267 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Danilo de Paula + 
+From: Danilo de Paula + +BZ: 1910267 +BRANCH: rhel-8.4.0 +UPSTREAM: RHEL-only +BREW: 33929331 + +When qemu-ga was introduced to RHEL-8, we used the qemu-guest-agent +from RHEL-7 as a base. + +In RHEL-7, qemu-guest-agent is built as a standalone package. +It's built as "qemu-ga", hence the "qemu-ga" folders. + +For RHEL-8, that should have been renamed to qemu-kvm, but I missed it. +Renaming those folders to /etc/qemu-kvm is a no go today, because +users might have populated the /etc/qemu-ga/fsfreeze-hook.d folder. + +So, in order to make qemu-ga -F work in RHEL-8, a link is being +created in the expected place, pointing to the real one. + +Also, fsfreeze-hook opens up the fsfreeze-hook.d on the same PATH where +it is stored. However, it doesn't follow symlinks. In order to fix this, +I had to change it to make sure it follows the link. + +An option would be to also link the fsfreeze-hook.d folder, but I chose +not to do so as it creates a permanent/visible change in users' +environments. The downside is to keep another downstream-only change. + +Signed-off-by: Danilo C. L. de Paula + +[ cherry-picked from commit 020501879841afb788087f0455df79367c0337a0 ] +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + redhat/qemu-kvm.spec.template | 6 ++++++ + scripts/qemu-guest-agent/fsfreeze-hook | 2 +- + 2 files changed, 7 insertions(+), 1 deletion(-) + + +diff --git a/scripts/qemu-guest-agent/fsfreeze-hook b/scripts/qemu-guest-agent/fsfreeze-hook +index 13aafd48451..e9b84ec0284 100755 +--- a/scripts/qemu-guest-agent/fsfreeze-hook ++++ b/scripts/qemu-guest-agent/fsfreeze-hook +@@ -8,7 +8,7 @@ + # request, it is issued with "thaw" argument after filesystem is thawed.
+ + LOGFILE=/var/log/qga-fsfreeze-hook.log +-FSFREEZE_D=$(dirname -- "$0")/fsfreeze-hook.d ++FSFREEZE_D=$(dirname -- "$(realpath $0)")/fsfreeze-hook.d + + # Check whether file $1 is a backup or rpm-generated file and should be ignored + is_ignored_file() { +-- +2.27.0 + diff --git a/kvm-replication-assert-we-own-context-before-job_cancel_.patch b/kvm-replication-assert-we-own-context-before-job_cancel_.patch new file mode 100755 index 0000000..09ef4de --- /dev/null +++ b/kvm-replication-assert-we-own-context-before-job_cancel_.patch @@ -0,0 +1,57 @@ +From 46887feac666d0d7633ff3f5af5721fe2a80a8ab Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:13 +0100 +Subject: [PATCH 2/6] replication: assert we own context before job_cancel_sync + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-3-kwolf@redhat.com> +Patchwork-id: 94595 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/6] replication: assert we own context before job_cancel_sync +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +From: Stefan Reiter + +job_cancel_sync requires the job's lock to be held, all other callers +already do this (replication_stop, drive_backup_abort, +blockdev_backup_abort, job_cancel_sync_all, cancel_common). + +In this case we're in a BlockDriver handler, so we already have a lock, +just assert that it is the same as the one used for the commit_job. + +Signed-off-by: Stefan Reiter +Message-Id: <20200407115651.69472-3-s.reiter@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 08558e33257ec796594bd411261028a93414a70c) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. 
de Paula +--- + block/replication.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/block/replication.c b/block/replication.c +index 99532ce..0ce27ee 100644 +--- a/block/replication.c ++++ b/block/replication.c +@@ -144,12 +144,15 @@ fail: + static void replication_close(BlockDriverState *bs) + { + BDRVReplicationState *s = bs->opaque; ++ Job *commit_job; + + if (s->stage == BLOCK_REPLICATION_RUNNING) { + replication_stop(s->rs, false, NULL); + } + if (s->stage == BLOCK_REPLICATION_FAILOVER) { +- job_cancel_sync(&s->commit_job->job); ++ commit_job = &s->commit_job->job; ++ assert(commit_job->aio_context == qemu_get_current_aio_context()); ++ job_cancel_sync(commit_job); + } + + if (s->mode == REPLICATION_MODE_SECONDARY) { +-- +1.8.3.1 + diff --git a/kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch b/kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch new file mode 100755 index 0000000..917e3ff --- /dev/null +++ b/kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch @@ -0,0 +1,54 @@ +From 4079c4e96f910fe7e57af13feb433f06246f1d79 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:44 -0400 +Subject: [PATCH 6/9] rtl8139: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-7-jmaloy@redhat.com> +Patchwork-id: 101792 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 6/9] rtl8139: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Alexander Bulekov + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. 
+ +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Buglink: https://bugs.launchpad.net/qemu/+bug/1910826 +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 5311fb805a4403bba024e83886fa0e7572265de4) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/rtl8139.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c +index 21d80e96cf..ccb04faa4c 100644 +--- a/hw/net/rtl8139.c ++++ b/hw/net/rtl8139.c +@@ -1793,7 +1793,7 @@ static void rtl8139_transfer_frame(RTL8139State *s, uint8_t *buf, int size, + } + + DPRINTF("+++ transmit loopback mode\n"); +- rtl8139_do_receive(qemu_get_queue(s->nic), buf, size, do_interrupt); ++ qemu_receive_packet(qemu_get_queue(s->nic), buf, size); + + if (iov) { + g_free(buf2); +-- +2.27.0 + diff --git a/kvm-s390-guest-support-for-diagnose-0x318.patch b/kvm-s390-guest-support-for-diagnose-0x318.patch new file mode 100755 index 0000000..84fc7bc --- /dev/null +++ b/kvm-s390-guest-support-for-diagnose-0x318.patch @@ -0,0 +1,282 @@ +From 7ad1c4aaea6cd202449c05fc0034af6b108def4f Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:14 -0500 +Subject: [PATCH 14/18] s390: guest support for diagnose 0x318 + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-11-thuth@redhat.com> +Patchwork-id: 99507 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 10/12] s390: guest support for diagnose 0x318 +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +DIAGNOSE 0x318 (diag318) is an s390 instruction that allows the storage +of diagnostic information that is collected by the firmware in the case +of hardware/firmware service events. + +QEMU handles the instruction by storing the info in the CPU state. A +subsequent register sync will communicate the data to the hypervisor. + +QEMU handles the migration via a VM State Description. 
+ +This feature depends on the Extended-Length SCCB (els) feature. If +els is not present, then a warning will be printed and the SCLP bit +that allows the Linux kernel to execute the instruction will not be +set. + +Availability of this instruction is determined by byte 134 (aka fac134) +bit 0 of the SCLP Read Info block. This coincidentally expands into the +space used for CPU entries, which means VMs running with the diag318 +capability may not be able to read information regarding all CPUs +unless the guest kernel supports an extended-length SCCB. + +This feature is not supported in protected virtualization mode. + +Signed-off-by: Collin Walling +Acked-by: Janosch Frank +Acked-by: Thomas Huth +Acked-by: David Hildenbrand +Acked-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-9-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit fabdada9357b9cfd980c7744ddce47e34600bbef) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 5 ++++ + include/hw/s390x/sclp.h | 8 ++++++ + target/s390x/cpu.h | 2 ++ + target/s390x/cpu_features.h | 1 + + target/s390x/cpu_features_def.inc.h | 3 +++ + target/s390x/cpu_models.c | 1 + + target/s390x/gen-features.c | 1 + + target/s390x/kvm.c | 39 +++++++++++++++++++++++++++++ + target/s390x/machine.c | 17 +++++++++++++ + 9 files changed, 77 insertions(+) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 8d111628e04..2931046f456 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -139,6 +139,11 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + s390_get_feat_block(S390_FEAT_TYPE_SCLP_CONF_CHAR_EXT, + read_info->conf_char_ext); + ++ if (s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB)) { ++ s390_get_feat_block(S390_FEAT_TYPE_SCLP_FAC134, ++ &read_info->fac134); ++ } ++ + read_info->facilities = cpu_to_be64(SCLP_HAS_CPU_INFO | + SCLP_HAS_IOA_RECONFIG); + +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index 62e2aa1d9f1..addd904e5f4 
100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -133,7 +133,15 @@ typedef struct ReadInfo { + uint16_t highest_cpu; + uint8_t _reserved5[124 - 122]; /* 122-123 */ + uint32_t hmfai; ++ uint8_t _reserved7[134 - 128]; /* 128-133 */ ++ uint8_t fac134; ++ uint8_t _reserved8[144 - 135]; /* 135-143 */ + struct CPUEntry entries[]; ++ /* ++ * When the Extended-Length SCCB (ELS) feature is enabled the ++ * start of the entries field begins at an offset denoted by the ++ * offset_cpu field, otherwise it's at an offset of 128. ++ */ + } QEMU_PACKED ReadInfo; + + typedef struct ReadCpuInfo { +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index a48e655c4d4..1dc21cd311d 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -117,6 +117,8 @@ struct CPUS390XState { + uint16_t external_call_addr; + DECLARE_BITMAP(emergency_signals, S390_MAX_CPUS); + ++ uint64_t diag318_info; ++ + /* Fields up to this point are cleared by a CPU reset */ + struct {} end_reset_fields; + +diff --git a/target/s390x/cpu_features.h b/target/s390x/cpu_features.h +index da695a8346e..f74f7fc3a11 100644 +--- a/target/s390x/cpu_features.h ++++ b/target/s390x/cpu_features.h +@@ -23,6 +23,7 @@ typedef enum { + S390_FEAT_TYPE_STFL, + S390_FEAT_TYPE_SCLP_CONF_CHAR, + S390_FEAT_TYPE_SCLP_CONF_CHAR_EXT, ++ S390_FEAT_TYPE_SCLP_FAC134, + S390_FEAT_TYPE_SCLP_CPU, + S390_FEAT_TYPE_MISC, + S390_FEAT_TYPE_PLO, +diff --git a/target/s390x/cpu_features_def.inc.h b/target/s390x/cpu_features_def.inc.h +index 3548d65a69a..cf7e04ee44f 100644 +--- a/target/s390x/cpu_features_def.inc.h ++++ b/target/s390x/cpu_features_def.inc.h +@@ -122,6 +122,9 @@ DEF_FEAT(SIE_CMMA, "cmma", SCLP_CONF_CHAR_EXT, 1, "SIE: Collaborative-memory-man + DEF_FEAT(SIE_PFMFI, "pfmfi", SCLP_CONF_CHAR_EXT, 9, "SIE: PFMF interpretation facility") + DEF_FEAT(SIE_IBS, "ibs", SCLP_CONF_CHAR_EXT, 10, "SIE: Interlock-and-broadcast-suppression facility") + ++/* Features exposed via SCLP SCCB Facilities byte 134 (bit numbers 
relative to byte-134) */ ++DEF_FEAT(DIAG_318, "diag318", SCLP_FAC134, 0, "Control program name and version codes") ++ + /* Features exposed via SCLP CPU info. */ + DEF_FEAT(SIE_F2, "sief2", SCLP_CPU, 4, "SIE: interception format 2 (Virtual SIE)") + DEF_FEAT(SIE_SKEY, "skey", SCLP_CPU, 5, "SIE: Storage-key facility") +diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c +index be718220d79..bf6a3faba9e 100644 +--- a/target/s390x/cpu_models.c ++++ b/target/s390x/cpu_models.c +@@ -823,6 +823,7 @@ static void check_consistency(const S390CPUModel *model) + { S390_FEAT_PTFF_STOE, S390_FEAT_MULTIPLE_EPOCH }, + { S390_FEAT_PTFF_STOUE, S390_FEAT_MULTIPLE_EPOCH }, + { S390_FEAT_AP_QUEUE_INTERRUPT_CONTROL, S390_FEAT_AP }, ++ { S390_FEAT_DIAG_318, S390_FEAT_EXTENDED_LENGTH_SCCB }, + }; + int i; + +diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c +index 6857f657fba..a1f0a6f3c6f 100644 +--- a/target/s390x/gen-features.c ++++ b/target/s390x/gen-features.c +@@ -523,6 +523,7 @@ static uint16_t full_GEN12_GA1[] = { + S390_FEAT_AP_FACILITIES_TEST, + S390_FEAT_AP, + S390_FEAT_EXTENDED_LENGTH_SCCB, ++ S390_FEAT_DIAG_318, + }; + + static uint16_t full_GEN12_GA2[] = { +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index ef437acb5c1..e5e190d21c9 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -105,6 +105,7 @@ + + #define DIAG_TIMEREVENT 0x288 + #define DIAG_IPL 0x308 ++#define DIAG_SET_CONTROL_PROGRAM_CODES 0x318 + #define DIAG_KVM_HYPERCALL 0x500 + #define DIAG_KVM_BREAKPOINT 0x501 + +@@ -602,6 +603,11 @@ int kvm_arch_put_registers(CPUState *cs, int level) + cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_ETOKEN; + } + ++ if (can_sync_regs(cs, KVM_SYNC_DIAG318)) { ++ cs->kvm_run->s.regs.diag318 = env->diag318_info; ++ cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_DIAG318; ++ } ++ + /* Finally the prefix */ + if (can_sync_regs(cs, KVM_SYNC_PREFIX)) { + cs->kvm_run->s.regs.prefix = env->psa; +@@ -741,6 +747,10 @@ int 
kvm_arch_get_registers(CPUState *cs) + } + } + ++ if (can_sync_regs(cs, KVM_SYNC_DIAG318)) { ++ env->diag318_info = cs->kvm_run->s.regs.diag318; ++ } ++ + return 0; + } + +@@ -1601,6 +1611,27 @@ static int handle_sw_breakpoint(S390CPU *cpu, struct kvm_run *run) + return -ENOENT; + } + ++static void handle_diag_318(S390CPU *cpu, struct kvm_run *run) ++{ ++ uint64_t reg = (run->s390_sieic.ipa & 0x00f0) >> 4; ++ uint64_t diag318_info = run->s.regs.gprs[reg]; ++ ++ /* ++ * DIAG 318 can only be enabled with KVM support. As such, let's ++ * ensure a guest cannot execute this instruction erroneously. ++ */ ++ if (!s390_has_feat(S390_FEAT_DIAG_318)) { ++ kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION); ++ } ++ ++ cpu->env.diag318_info = diag318_info; ++ ++ if (can_sync_regs(CPU(cpu), KVM_SYNC_DIAG318)) { ++ run->s.regs.diag318 = diag318_info; ++ run->kvm_dirty_regs |= KVM_SYNC_DIAG318; ++ } ++} ++ + #define DIAG_KVM_CODE_MASK 0x000000000000ffff + + static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb) +@@ -1620,6 +1651,9 @@ static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb) + case DIAG_IPL: + kvm_handle_diag_308(cpu, run); + break; ++ case DIAG_SET_CONTROL_PROGRAM_CODES: ++ handle_diag_318(cpu, run); ++ break; + case DIAG_KVM_HYPERCALL: + r = handle_hypercall(cpu, run); + break; +@@ -2449,6 +2483,11 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp) + */ + set_bit(S390_FEAT_EXTENDED_LENGTH_SCCB, model->features); + ++ /* DIAGNOSE 0x318 is not supported under protected virtualization */ ++ if (!s390_is_pv() && kvm_check_extension(kvm_state, KVM_CAP_S390_DIAG318)) { ++ set_bit(S390_FEAT_DIAG_318, model->features); ++ } ++ + /* strip of features that are not part of the maximum model */ + bitmap_and(model->features, model->features, model->def->full_feat, + S390_FEAT_MAX); +diff --git a/target/s390x/machine.c b/target/s390x/machine.c +index 549bb6c2808..5b4e82f1ab9 100644 +--- a/target/s390x/machine.c ++++ 
b/target/s390x/machine.c +@@ -234,6 +234,22 @@ const VMStateDescription vmstate_etoken = { + } + }; + ++static bool diag318_needed(void *opaque) ++{ ++ return s390_has_feat(S390_FEAT_DIAG_318); ++} ++ ++const VMStateDescription vmstate_diag318 = { ++ .name = "cpu/diag318", ++ .version_id = 1, ++ .minimum_version_id = 1, ++ .needed = diag318_needed, ++ .fields = (VMStateField[]) { ++ VMSTATE_UINT64(env.diag318_info, S390CPU), ++ VMSTATE_END_OF_LIST() ++ } ++}; ++ + const VMStateDescription vmstate_s390_cpu = { + .name = "cpu", + .post_load = cpu_post_load, +@@ -270,6 +286,7 @@ const VMStateDescription vmstate_s390_cpu = { + &vmstate_gscb, + &vmstate_bpbc, + &vmstate_etoken, ++ &vmstate_diag318, + NULL + }, + }; +-- +2.27.0 + diff --git a/kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch b/kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch new file mode 100755 index 0000000..c45158a --- /dev/null +++ b/kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch @@ -0,0 +1,54 @@ +From 1769600e1e3bd5ca48450de8ce8a118bf0af96f3 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:00 -0400 +Subject: [PATCH 18/42] s390/ipl: fix off-by-one in + update_machine_ipl_properties() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-19-thuth@redhat.com> +Patchwork-id: 97028 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 18/38] s390/ipl: fix off-by-one in update_machine_ipl_properties() +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Halil Pasic + +In update_machine_ipl_properties() the array ascii_loadparm needs to +hold the 8 char loadparm and a string terminating zero char. + +Let's increase the size of ascii_loadparm accordingly. 
+ +Signed-off-by: Halil Pasic +Fixes: 0a01e082a428 ("s390/ipl: sync back loadparm") +Fixes: Coverity CID 1421966 +Reported-by: Peter Maydell +Message-Id: <20200320143101.41764-1-pasic@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 7722837369eb1c7e808021d79da68afa0c01c26f) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index f25339c503..fa0409dc23 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -537,7 +537,7 @@ static void update_machine_ipl_properties(IplParameterBlock *iplb) + /* Sync loadparm */ + if (iplb->flags & DIAG308_FLAGS_LP_VALID) { + uint8_t *ebcdic_loadparm = iplb->loadparm; +- char ascii_loadparm[8]; ++ char ascii_loadparm[9]; + int i; + + for (i = 0; i < 8 && ebcdic_loadparm[i]; i++) { +-- +2.27.0 + diff --git a/kvm-s390-ipl-sync-back-loadparm.patch b/kvm-s390-ipl-sync-back-loadparm.patch new file mode 100755 index 0000000..49f4d3f --- /dev/null +++ b/kvm-s390-ipl-sync-back-loadparm.patch @@ -0,0 +1,91 @@ +From 53053ea2e6c757e5d044655c8b61c485e0aad4ed Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:59 -0400 +Subject: [PATCH 17/42] s390/ipl: sync back loadparm +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-18-thuth@redhat.com> +Patchwork-id: 97039 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 17/38] s390/ipl: sync back loadparm +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Halil Pasic + +We expose loadparm as a r/w machine property, but if loadparm is set by +the guest via DIAG 308, we don't update the property. 
Having a +disconnect between the guest view and the QEMU property is not nice in +itself, but things get even worse for SCSI, where under certain +circumstances (see 789b5a401b "s390: Ensure IPL from SCSI works as +expected" for details) we call s390_gen_initial_iplb() on resets +effectively overwriting the guest/user supplied loadparm with the stale +value. + +Signed-off-by: Halil Pasic +Fixes: 7104bae9de ("hw/s390x: provide loadparm property for the machine") +Reported-by: Marc Hartmayer +Reviewed-by: Janosch Frank +Reviewed-by: Viktor Mihajlovski +Tested-by: Marc Hartmayer +Reviewed-by: David Hildenbrand +Message-Id: <20200309133223.100491-1-pasic@linux.ibm.com> +[borntraeger@de.ibm.com: use reverse xmas tree] +Signed-off-by: Christian Borntraeger +(cherry picked from commit 0a01e082a428b921e48b5314881b1f23a7b0fe50) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.c | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index 0b7548a549..f25339c503 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -529,6 +529,30 @@ static bool is_virtio_scsi_device(IplParameterBlock *iplb) + return is_virtio_ccw_device_of_type(iplb, VIRTIO_ID_SCSI); + } + ++static void update_machine_ipl_properties(IplParameterBlock *iplb) ++{ ++ Object *machine = qdev_get_machine(); ++ Error *err = NULL; ++ ++ /* Sync loadparm */ ++ if (iplb->flags & DIAG308_FLAGS_LP_VALID) { ++ uint8_t *ebcdic_loadparm = iplb->loadparm; ++ char ascii_loadparm[8]; ++ int i; ++ ++ for (i = 0; i < 8 && ebcdic_loadparm[i]; i++) { ++ ascii_loadparm[i] = ebcdic2ascii[(uint8_t) ebcdic_loadparm[i]]; ++ } ++ ascii_loadparm[i] = 0; ++ object_property_set_str(machine, ascii_loadparm, "loadparm", &err); ++ } else { ++ object_property_set_str(machine, "", "loadparm", &err); ++ } ++ if (err) { ++ warn_report_err(err); ++ } ++} ++ + void s390_ipl_update_diag308(IplParameterBlock *iplb) + { + S390IPLState *ipl = get_ipl_device(); +@@ -536,6 +560,7 @@ 
void s390_ipl_update_diag308(IplParameterBlock *iplb) + ipl->iplb = *iplb; + ipl->iplb_valid = true; + ipl->netboot = is_virtio_net_device(iplb); ++ update_machine_ipl_properties(iplb); + } + + IplParameterBlock *s390_ipl_get_iplb(void) +-- +2.27.0 + diff --git a/kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch b/kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch new file mode 100755 index 0000000..f0f25a5 --- /dev/null +++ b/kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch @@ -0,0 +1,163 @@ +From a0ad4344984c50939be8c99371af0988551fb776 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 20 Nov 2020 11:46:09 -0500 +Subject: [PATCH 17/18] s390/kvm: fix diag318 propagation and reset + functionality + +RH-Author: Thomas Huth +Message-id: <20201120114609.408610-2-thuth@redhat.com> +Patchwork-id: 99787 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] s390/kvm: fix diag318 propagation and reset functionality +Bugzilla: 1659412 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +The Control Program Name Code (CPNC) portion of the diag318 +info must be set within the SIE block of each VCPU in the +configuration. The handler will iterate through each VCPU +and dirty the diag318_info reg to be synced with KVM on a +subsequent sync_regs call. + +Additionally, the diag318 info resets must be handled via +userspace. As such, QEMU will reset this value for each +VCPU during a modified clear, load normal, and load clear +reset event. + +Fixes: fabdada9357b ("s390: guest support for diagnose 0x318") +Signed-off-by: Collin Walling +Message-Id: <20201113221022.257054-1-walling@linux.ibm.com> +Reviewed-by: Thomas Huth +Reviewed-by: Janosch Frank +Signed-off-by: Cornelia Huck +(cherry picked from commit e2c6cd567422bfa563be026b9741a1854aecdc06) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/s390-virtio-ccw.c | 4 ++++ + target/s390x/cpu.c | 7 +++++++ + target/s390x/cpu.h | 1 + + target/s390x/kvm-stub.c | 4 ++++ + target/s390x/kvm.c | 22 +++++++++++++++++----- + target/s390x/kvm_s390x.h | 1 + + 6 files changed, 34 insertions(+), 5 deletions(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index e6ed13b649a..5905d2b7adc 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -489,6 +489,10 @@ static void s390_machine_reset(MachineState *machine) + default: + g_assert_not_reached(); + } ++ ++ CPU_FOREACH(t) { ++ run_on_cpu(t, s390_do_cpu_set_diag318, RUN_ON_CPU_HOST_ULONG(0)); ++ } + s390_ipl_clear_reset_request(); + } + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 371b91b2d72..820cab96e12 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -445,6 +445,13 @@ void s390_enable_css_support(S390CPU *cpu) + kvm_s390_enable_css_support(cpu); + } + } ++ ++void s390_do_cpu_set_diag318(CPUState *cs, run_on_cpu_data arg) ++{ ++ if (kvm_enabled()) { ++ kvm_s390_set_diag318(cs, arg.host_ulong); ++ } ++} + #endif + + static gchar *s390_gdb_arch_name(CPUState *cs) +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 1dc21cd311d..83a23a11b96 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -774,6 +774,7 @@ int s390_set_memory_limit(uint64_t new_limit, uint64_t *hw_limit); + void s390_set_max_pagesize(uint64_t pagesize, Error **errp); + void s390_cmma_reset(void); + void s390_enable_css_support(S390CPU *cpu); ++void s390_do_cpu_set_diag318(CPUState *cs, run_on_cpu_data arg); + int s390_assign_subch_ioeventfd(EventNotifier *notifier, uint32_t sch_id, + int vq, bool assign); + #ifndef CONFIG_USER_ONLY +diff --git a/target/s390x/kvm-stub.c b/target/s390x/kvm-stub.c +index aa185017a2a..9970b5a8c70 100644 +--- a/target/s390x/kvm-stub.c ++++ b/target/s390x/kvm-stub.c +@@ -120,3 +120,7 @@ void kvm_s390_stop_interrupt(S390CPU *cpu) + void 
kvm_s390_restart_interrupt(S390CPU *cpu) + { + } ++ ++void kvm_s390_set_diag318(CPUState *cs, uint64_t diag318_info) ++{ ++} +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 6edb52f6d25..8d4406124b9 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -1611,10 +1611,23 @@ static int handle_sw_breakpoint(S390CPU *cpu, struct kvm_run *run) + return -ENOENT; + } + ++void kvm_s390_set_diag318(CPUState *cs, uint64_t diag318_info) ++{ ++ CPUS390XState *env = &S390_CPU(cs)->env; ++ ++ /* Feat bit is set only if KVM supports sync for diag318 */ ++ if (s390_has_feat(S390_FEAT_DIAG_318)) { ++ env->diag318_info = diag318_info; ++ cs->kvm_run->s.regs.diag318 = diag318_info; ++ cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_DIAG318; ++ } ++} ++ + static void handle_diag_318(S390CPU *cpu, struct kvm_run *run) + { + uint64_t reg = (run->s390_sieic.ipa & 0x00f0) >> 4; + uint64_t diag318_info = run->s.regs.gprs[reg]; ++ CPUState *t; + + /* + * DIAG 318 can only be enabled with KVM support. 
As such, let's +@@ -1622,13 +1635,12 @@ static void handle_diag_318(S390CPU *cpu, struct kvm_run *run) + */ + if (!s390_has_feat(S390_FEAT_DIAG_318)) { + kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION); ++ return; + } + +- cpu->env.diag318_info = diag318_info; +- +- if (can_sync_regs(CPU(cpu), KVM_SYNC_DIAG318)) { +- run->s.regs.diag318 = diag318_info; +- run->kvm_dirty_regs |= KVM_SYNC_DIAG318; ++ CPU_FOREACH(t) { ++ run_on_cpu(t, s390_do_cpu_set_diag318, ++ RUN_ON_CPU_HOST_ULONG(diag318_info)); + } + } + +diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h +index 6ab17c81b73..25bbe98b251 100644 +--- a/target/s390x/kvm_s390x.h ++++ b/target/s390x/kvm_s390x.h +@@ -45,5 +45,6 @@ void kvm_s390_set_max_pagesize(uint64_t pagesize, Error **errp); + void kvm_s390_crypto_reset(void); + void kvm_s390_restart_interrupt(S390CPU *cpu); + void kvm_s390_stop_interrupt(S390CPU *cpu); ++void kvm_s390_set_diag318(CPUState *cs, uint64_t diag318_info); + + #endif /* KVM_S390X_H */ +-- +2.27.0 + diff --git a/kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch b/kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch new file mode 100755 index 0000000..c05f50c --- /dev/null +++ b/kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch @@ -0,0 +1,220 @@ +From e1a3684f9b08fa9db35331b5c5ad11879f512e90 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:11 -0500 +Subject: [PATCH 11/18] s390/sclp: add extended-length sccb support for kvm + guest + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-8-thuth@redhat.com> +Patchwork-id: 99504 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 07/12] s390/sclp: add extended-length sccb support for kvm guest +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +As more features and facilities are added to the Read SCP Info (RSCPI) +response, more space is required to store them. 
The space used to store +these new features intrudes on the space originally used to store CPU +entries. This means as more features and facilities are added to the +RSCPI response, less space can be used to store CPU entries. + +With the Extended-Length SCCB (ELS) facility, a KVM guest can execute +the RSCPI command and determine if the SCCB is large enough to store a +complete response. If it is not large enough, then the required length +will be set in the SCCB header. + +The caller of the SCLP command is responsible for creating a +large-enough SCCB to store a complete response. Proper checking should +be in place, and the caller should execute the command once-more with +the large-enough SCCB. + +This facility also enables an extended SCCB for the Read CPU Info +(RCPUI) command. + +When this facility is enabled, the boundary violation response cannot +be a result from the RSCPI, RSCPI Forced, or RCPUI commands. + +In order to tolerate kernels that do not yet have full support for this +feature, a "fixed" offset to the start of the CPU Entries within the +Read SCP Info struct is set to allow for the original 248 max entries +when this feature is disabled. + +Additionally, this is introduced as a CPU feature to protect the guest +from migrating to a machine that does not support storing an extended +SCCB. This could otherwise hinder the VM from being able to read all +available CPU entries after migration (such as during re-ipl). + +Signed-off-by: Collin Walling +Reviewed-by: Thomas Huth +Acked-by: Cornelia Huck +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-7-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 1ecd6078f587cfadda8edc93d45b5072e35f2d17) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/sclp.c | 43 +++++++++++++++++++++++++---- + include/hw/s390x/sclp.h | 1 + + target/s390x/cpu_features_def.inc.h | 1 + + target/s390x/gen-features.c | 1 + + target/s390x/kvm.c | 8 ++++++ + 5 files changed, 48 insertions(+), 6 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 017989b3888..8d111628e04 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -49,13 +49,30 @@ static inline bool sclp_command_code_valid(uint32_t code) + return false; + } + +-static bool sccb_verify_boundary(uint64_t sccb_addr, uint16_t sccb_len) ++static bool sccb_verify_boundary(uint64_t sccb_addr, uint16_t sccb_len, ++ uint32_t code) + { + uint64_t sccb_max_addr = sccb_addr + sccb_len - 1; + uint64_t sccb_boundary = (sccb_addr & PAGE_MASK) + PAGE_SIZE; + +- if (sccb_max_addr < sccb_boundary) { +- return true; ++ switch (code & SCLP_CMD_CODE_MASK) { ++ case SCLP_CMDW_READ_SCP_INFO: ++ case SCLP_CMDW_READ_SCP_INFO_FORCED: ++ case SCLP_CMDW_READ_CPU_INFO: ++ /* ++ * An extended-length SCCB is only allowed for Read SCP/CPU Info and ++ * is allowed to exceed the 4k boundary. The respective commands will ++ * set the length field to the required length if an insufficient ++ * SCCB length is provided. 
++ */ ++ if (s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB)) { ++ return true; ++ } ++ /* fallthrough */ ++ default: ++ if (sccb_max_addr < sccb_boundary) { ++ return true; ++ } + } + + return false; +@@ -80,6 +97,12 @@ static void prepare_cpu_entries(MachineState *ms, CPUEntry *entry, int *count) + + #define SCCB_REQ_LEN(s, max_cpus) (sizeof(s) + max_cpus * sizeof(CPUEntry)) + ++static inline bool ext_len_sccb_supported(SCCBHeader header) ++{ ++ return s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB) && ++ header.control_mask[2] & SCLP_VARIABLE_LENGTH_RESPONSE; ++} ++ + /* Provide information about the configuration, CPUs and storage */ + static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + { +@@ -89,10 +112,15 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + int rnsize, rnmax; + IplParameterBlock *ipib = s390_ipl_get_iplb(); + int required_len = SCCB_REQ_LEN(ReadInfo, machine->possible_cpus->len); +- int offset_cpu = offsetof(ReadInfo, entries); ++ int offset_cpu = s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB) ? 
++ offsetof(ReadInfo, entries) : ++ SCLP_READ_SCP_INFO_FIXED_CPU_OFFSET; + CPUEntry *entries_start = (void *)sccb + offset_cpu; + + if (be16_to_cpu(sccb->h.length) < required_len) { ++ if (ext_len_sccb_supported(sccb->h)) { ++ sccb->h.length = cpu_to_be16(required_len); ++ } + sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); + return; + } +@@ -153,6 +181,9 @@ static void sclp_read_cpu_info(SCLPDevice *sclp, SCCB *sccb) + int required_len = SCCB_REQ_LEN(ReadCpuInfo, machine->possible_cpus->len); + + if (be16_to_cpu(sccb->h.length) < required_len) { ++ if (ext_len_sccb_supported(sccb->h)) { ++ sccb->h.length = cpu_to_be16(required_len); ++ } + sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); + return; + } +@@ -249,7 +280,7 @@ int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length))) { ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length), code)) { + work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } +@@ -302,7 +333,7 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length))) { ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length), code)) { + work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index 55f53a46540..df2fa4169b0 100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -110,6 +110,7 @@ typedef struct CPUEntry { + uint8_t reserved1; + } QEMU_PACKED CPUEntry; + ++#define SCLP_READ_SCP_INFO_FIXED_CPU_OFFSET 128 + typedef struct ReadInfo { + SCCBHeader h; + uint16_t rnmax; +diff --git a/target/s390x/cpu_features_def.inc.h b/target/s390x/cpu_features_def.inc.h +index 60db28351d0..3548d65a69a 100644 +--- 
a/target/s390x/cpu_features_def.inc.h ++++ b/target/s390x/cpu_features_def.inc.h +@@ -97,6 +97,7 @@ DEF_FEAT(GUARDED_STORAGE, "gs", STFL, 133, "Guarded-storage facility") + DEF_FEAT(VECTOR_PACKED_DECIMAL, "vxpd", STFL, 134, "Vector packed decimal facility") + DEF_FEAT(VECTOR_ENH, "vxeh", STFL, 135, "Vector enhancements facility") + DEF_FEAT(MULTIPLE_EPOCH, "mepoch", STFL, 139, "Multiple-epoch facility") ++DEF_FEAT(EXTENDED_LENGTH_SCCB, "els", STFL, 140, "Extended-length SCCB facility") + DEF_FEAT(TEST_PENDING_EXT_INTERRUPTION, "tpei", STFL, 144, "Test-pending-external-interruption facility") + DEF_FEAT(INSERT_REFERENCE_BITS_MULT, "irbm", STFL, 145, "Insert-reference-bits-multiple facility") + DEF_FEAT(MSA_EXT_8, "msa8-base", STFL, 146, "Message-security-assist-extension-8 facility (excluding subfunctions)") +diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c +index 8ddeebc5441..6857f657fba 100644 +--- a/target/s390x/gen-features.c ++++ b/target/s390x/gen-features.c +@@ -522,6 +522,7 @@ static uint16_t full_GEN12_GA1[] = { + S390_FEAT_AP_QUEUE_INTERRUPT_CONTROL, + S390_FEAT_AP_FACILITIES_TEST, + S390_FEAT_AP, ++ S390_FEAT_EXTENDED_LENGTH_SCCB, + }; + + static uint16_t full_GEN12_GA2[] = { +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 0bbf8f81b09..ef437acb5c1 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -2441,6 +2441,14 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp) + KVM_S390_VM_CRYPTO_ENABLE_APIE)) { + set_bit(S390_FEAT_AP, model->features); + } ++ ++ /* ++ * Extended-Length SCCB is handled entirely within QEMU. ++ * For PV guests this is completely fenced by the Ultravisor, as Service ++ * Call error checking and STFLE interpretation are handled via SIE. 
++ */ ++ set_bit(S390_FEAT_EXTENDED_LENGTH_SCCB, model->features); ++ + /* strip of features that are not part of the maximum model */ + bitmap_and(model->features, model->features, model->def->full_feat, + S390_FEAT_MAX); +-- +2.27.0 + diff --git a/kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch b/kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch new file mode 100755 index 0000000..6efc35f --- /dev/null +++ b/kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch @@ -0,0 +1,106 @@ +From 6cc7c8dd7a6fac493c648c607bec4c38c0b275b6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:09 -0500 +Subject: [PATCH 09/18] s390/sclp: check sccb len before filling in data + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-6-thuth@redhat.com> +Patchwork-id: 99502 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 05/12] s390/sclp: check sccb len before filling in data +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +The SCCB must be checked for a sufficient length before it is filled +with any data. If the length is insufficient, then the SCLP command +is suppressed and the proper response code is set in the SCCB header. + +While we're at it, let's cleanup the length check by placing the +calculation inside a macro. + +Fixes: 832be0d8a3bb ("s390x: sclp: Report insufficient SCCB length") +Signed-off-by: Collin Walling +Reviewed-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Cornelia Huck +Reviewed-by: Thomas Huth +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-5-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 0260b97824495ebfacfa8bbae0be10b0ef986bf6) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/sclp.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index cf1292beb22..2b4c6c5cfad 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -78,6 +78,8 @@ static void prepare_cpu_entries(MachineState *ms, CPUEntry *entry, int *count) + } + } + ++#define SCCB_REQ_LEN(s, max_cpus) (sizeof(s) + max_cpus * sizeof(CPUEntry)) ++ + /* Provide information about the configuration, CPUs and storage */ + static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + { +@@ -86,6 +88,12 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + int cpu_count; + int rnsize, rnmax; + IplParameterBlock *ipib = s390_ipl_get_iplb(); ++ int required_len = SCCB_REQ_LEN(ReadInfo, machine->possible_cpus->len); ++ ++ if (be16_to_cpu(sccb->h.length) < required_len) { ++ sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); ++ return; ++ } + + /* CPU information */ + prepare_cpu_entries(machine, read_info->entries, &cpu_count); +@@ -95,12 +103,6 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + + read_info->ibc_val = cpu_to_be32(s390_get_ibc_val()); + +- if (be16_to_cpu(sccb->h.length) < +- (sizeof(ReadInfo) + cpu_count * sizeof(CPUEntry))) { +- sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); +- return; +- } +- + /* Configuration Characteristic (Extension) */ + s390_get_feat_block(S390_FEAT_TYPE_SCLP_CONF_CHAR, + read_info->conf_char); +@@ -146,18 +148,18 @@ static void sclp_read_cpu_info(SCLPDevice *sclp, SCCB *sccb) + MachineState *machine = MACHINE(qdev_get_machine()); + ReadCpuInfo *cpu_info = (ReadCpuInfo *) sccb; + int cpu_count; ++ int required_len = SCCB_REQ_LEN(ReadCpuInfo, machine->possible_cpus->len); ++ ++ if (be16_to_cpu(sccb->h.length) < required_len) { ++ sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); ++ return; ++ } + + prepare_cpu_entries(machine, cpu_info->entries, &cpu_count); + 
cpu_info->nr_configured = cpu_to_be16(cpu_count); + cpu_info->offset_configured = cpu_to_be16(offsetof(ReadCpuInfo, entries)); + cpu_info->nr_standby = cpu_to_be16(0); + +- if (be16_to_cpu(sccb->h.length) < +- (sizeof(ReadCpuInfo) + cpu_count * sizeof(CPUEntry))) { +- sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); +- return; +- } +- + /* The standby offset is 16-byte for each CPU */ + cpu_info->offset_standby = cpu_to_be16(cpu_info->offset_configured + + cpu_info->nr_configured*sizeof(CPUEntry)); +-- +2.27.0 + diff --git a/kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch b/kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch new file mode 100755 index 0000000..09c72b6 --- /dev/null +++ b/kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch @@ -0,0 +1,75 @@ +From 44e8cdba29b932ee6fff7a2d00b09e6e78c3a0ef Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:06 -0500 +Subject: [PATCH 06/18] s390/sclp: get machine once during read scp/cpu info + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-3-thuth@redhat.com> +Patchwork-id: 99499 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 02/12] s390/sclp: get machine once during read scp/cpu info +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +Functions within read scp/cpu info will need access to the machine +state. Let's make a call to retrieve the machine state once and +pass the appropriate data to the respective functions. + +Signed-off-by: Collin Walling +Reviewed-by: David Hildenbrand +Reviewed-by: Thomas Huth +Reviewed-by: Janosch Frank +Reviewed-by: Cornelia Huck +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-2-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 912d70d2755cb9b3144eeed4014580ebc5485ce6) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/sclp.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index d8ae207731f..fe7d0fece80 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -49,9 +49,8 @@ static inline bool sclp_command_code_valid(uint32_t code) + return false; + } + +-static void prepare_cpu_entries(SCLPDevice *sclp, CPUEntry *entry, int *count) ++static void prepare_cpu_entries(MachineState *ms, CPUEntry *entry, int *count) + { +- MachineState *ms = MACHINE(qdev_get_machine()); + uint8_t features[SCCB_CPU_FEATURE_LEN] = { 0 }; + int i; + +@@ -77,7 +76,7 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + IplParameterBlock *ipib = s390_ipl_get_iplb(); + + /* CPU information */ +- prepare_cpu_entries(sclp, read_info->entries, &cpu_count); ++ prepare_cpu_entries(machine, read_info->entries, &cpu_count); + read_info->entries_cpu = cpu_to_be16(cpu_count); + read_info->offset_cpu = cpu_to_be16(offsetof(ReadInfo, entries)); + read_info->highest_cpu = cpu_to_be16(machine->smp.max_cpus - 1); +@@ -132,10 +131,11 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + /* Provide information about the CPU */ + static void sclp_read_cpu_info(SCLPDevice *sclp, SCCB *sccb) + { ++ MachineState *machine = MACHINE(qdev_get_machine()); + ReadCpuInfo *cpu_info = (ReadCpuInfo *) sccb; + int cpu_count; + +- prepare_cpu_entries(sclp, cpu_info->entries, &cpu_count); ++ prepare_cpu_entries(machine, cpu_info->entries, &cpu_count); + cpu_info->nr_configured = cpu_to_be16(cpu_count); + cpu_info->offset_configured = cpu_to_be16(offsetof(ReadCpuInfo, entries)); + cpu_info->nr_standby = cpu_to_be16(0); +-- +2.27.0 + diff --git a/kvm-s390-sclp-improve-special-wait-psw-logic.patch b/kvm-s390-sclp-improve-special-wait-psw-logic.patch new file mode 100755 index 0000000..2040d5c --- /dev/null +++ b/kvm-s390-sclp-improve-special-wait-psw-logic.patch @@ -0,0 +1,52 @@ +From cd7da3cf1b19fef0a497fd556562040a85e579a7 Mon Sep 17 
00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:57 -0400 +Subject: [PATCH 15/42] s390/sclp: improve special wait psw logic +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-16-thuth@redhat.com> +Patchwork-id: 97037 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 15/38] s390/sclp: improve special wait psw logic +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +There is a special quiesce PSW that we check for "shutdown". Otherwise disabled +wait is detected as "crashed". Architecturally we must only check PSW bits +116-127. Fix this. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Christian Borntraeger +Message-Id: <1582204582-22995-1-git-send-email-borntraeger@de.ibm.com> +Reviewed-by: David Hildenbrand +Acked-by: Janosch Frank +Signed-off-by: Cornelia Huck +(cherry picked from commit 8b51c0961cc13e55b26bb6665ec3a341abdc7658) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/s390x/helper.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/target/s390x/helper.c b/target/s390x/helper.c +index a3a49164e4..6808dfda01 100644 +--- a/target/s390x/helper.c ++++ b/target/s390x/helper.c +@@ -89,7 +89,7 @@ hwaddr s390_cpu_get_phys_addr_debug(CPUState *cs, vaddr vaddr) + static inline bool is_special_wait_psw(uint64_t psw_addr) + { + /* signal quiesce */ +- return psw_addr == 0xfffUL; ++ return (psw_addr & 0xfffUL) == 0xfffUL; + } + + void s390_handle_wait(S390CPU *cpu) +-- +2.27.0 + diff --git a/kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch b/kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch new file mode 100755 index 0000000..adb65c7 --- /dev/null +++ b/kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch @@ -0,0 +1,170 @@ +From 212c129b82f0a53725a4167303de2ee0a865f82d Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:08 -0500 +Subject: [PATCH 08/18] s390/sclp: read sccb from mem based on provided length + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-5-thuth@redhat.com> +Patchwork-id: 99501 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 04/12] s390/sclp: read sccb from mem based on provided length +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +The header contained within the SCCB passed to the SCLP service call +contains the actual length of the SCCB. Instead of allocating a static +4K size for the work sccb, let's allow for a variable size determined +by the value in the header. The proper checks are already in place to +ensure the SCCB length is sufficient to store a full response and that +the length does not cross any explicitly-set boundaries. 
+ +Signed-off-by: Collin Walling +Reviewed-by: Thomas Huth +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-4-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit c1db53a5910f988eeb32f031c53a50f3373fd824) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/event-facility.c | 2 +- + hw/s390x/sclp.c | 55 ++++++++++++++++++++++----------------- + include/hw/s390x/sclp.h | 2 +- + 3 files changed, 33 insertions(+), 26 deletions(-) + +diff --git a/hw/s390x/event-facility.c b/hw/s390x/event-facility.c +index 66205697ae7..8aa7017f06b 100644 +--- a/hw/s390x/event-facility.c ++++ b/hw/s390x/event-facility.c +@@ -215,7 +215,7 @@ static uint16_t handle_sccb_read_events(SCLPEventFacility *ef, SCCB *sccb, + + event_buf = &red->ebh; + event_buf->length = 0; +- slen = sizeof(sccb->data); ++ slen = sccb_data_len(sccb); + + rc = SCLP_RC_NO_EVENT_BUFFERS_STORED; + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 38278497319..cf1292beb22 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -231,25 +231,29 @@ int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, + { + SCLPDevice *sclp = get_sclp_device(); + SCLPDeviceClass *sclp_c = SCLP_GET_CLASS(sclp); +- SCCB work_sccb; +- hwaddr sccb_len = sizeof(SCCB); ++ SCCBHeader header; ++ g_autofree SCCB *work_sccb = NULL; + +- s390_cpu_pv_mem_read(env_archcpu(env), 0, &work_sccb, sccb_len); ++ s390_cpu_pv_mem_read(env_archcpu(env), 0, &header, sizeof(SCCBHeader)); ++ ++ work_sccb = g_malloc0(be16_to_cpu(header.length)); ++ s390_cpu_pv_mem_read(env_archcpu(env), 0, work_sccb, ++ be16_to_cpu(header.length)); + + if (!sclp_command_code_valid(code)) { +- work_sccb.h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); ++ work_sccb->h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb.h.length))) { +- work_sccb.h.response_code = 
cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length))) { ++ work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } + +- sclp_c->execute(sclp, &work_sccb, code); ++ sclp_c->execute(sclp, work_sccb, code); + out_write: +- s390_cpu_pv_mem_write(env_archcpu(env), 0, &work_sccb, +- be16_to_cpu(work_sccb.h.length)); ++ s390_cpu_pv_mem_write(env_archcpu(env), 0, work_sccb, ++ be16_to_cpu(work_sccb->h.length)); + sclp_c->service_interrupt(sclp, SCLP_PV_DUMMY_ADDR); + return 0; + } +@@ -258,9 +262,8 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + { + SCLPDevice *sclp = get_sclp_device(); + SCLPDeviceClass *sclp_c = SCLP_GET_CLASS(sclp); +- SCCB work_sccb; +- +- hwaddr sccb_len = sizeof(SCCB); ++ SCCBHeader header; ++ g_autofree SCCB *work_sccb = NULL; + + /* first some basic checks on program checks */ + if (env->psw.mask & PSW_MASK_PSTATE) { +@@ -274,32 +277,36 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + return -PGM_SPECIFICATION; + } + ++ /* the header contains the actual length of the sccb */ ++ cpu_physical_memory_read(sccb, &header, sizeof(SCCBHeader)); ++ ++ /* Valid sccb sizes */ ++ if (be16_to_cpu(header.length) < sizeof(SCCBHeader)) { ++ return -PGM_SPECIFICATION; ++ } ++ + /* + * we want to work on a private copy of the sccb, to prevent guests + * from playing dirty tricks by modifying the memory content after + * the host has checked the values + */ +- cpu_physical_memory_read(sccb, &work_sccb, sccb_len); +- +- /* Valid sccb sizes */ +- if (be16_to_cpu(work_sccb.h.length) < sizeof(SCCBHeader)) { +- return -PGM_SPECIFICATION; +- } ++ work_sccb = g_malloc0(be16_to_cpu(header.length)); ++ cpu_physical_memory_read(sccb, work_sccb, be16_to_cpu(header.length)); + + if (!sclp_command_code_valid(code)) { +- work_sccb.h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); ++ work_sccb->h.response_code = 
cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb.h.length))) { +- work_sccb.h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length))) { ++ work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } + +- sclp_c->execute(sclp, &work_sccb, code); ++ sclp_c->execute(sclp, work_sccb, code); + out_write: +- cpu_physical_memory_write(sccb, &work_sccb, +- be16_to_cpu(work_sccb.h.length)); ++ cpu_physical_memory_write(sccb, work_sccb, ++ be16_to_cpu(work_sccb->h.length)); + + sclp_c->service_interrupt(sclp, sccb); + +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index c0a3faa37d7..55f53a46540 100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -177,7 +177,7 @@ typedef struct IoaCfgSccb { + + typedef struct SCCB { + SCCBHeader h; +- char data[SCCB_DATA_LEN]; ++ char data[]; + } QEMU_PACKED SCCB; + + #define TYPE_SCLP "sclp" +-- +2.27.0 + diff --git a/kvm-s390-sclp-rework-sclp-boundary-checks.patch b/kvm-s390-sclp-rework-sclp-boundary-checks.patch new file mode 100755 index 0000000..9bb3a55 --- /dev/null +++ b/kvm-s390-sclp-rework-sclp-boundary-checks.patch @@ -0,0 +1,80 @@ +From bc395a979a00bb3e16f3bd92b5b2006db4a5aee3 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:07 -0500 +Subject: [PATCH 07/18] s390/sclp: rework sclp boundary checks + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-4-thuth@redhat.com> +Patchwork-id: 99500 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 03/12] s390/sclp: rework sclp boundary checks +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +Rework the SCLP boundary check to account for different SCLP commands +(eventually) allowing different boundary sizes. 
+ +Signed-off-by: Collin Walling +Reviewed-by: Cornelia Huck +Reviewed-by: Thomas Huth +Acked-by: Janosch Frank +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-3-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit db13387ca01a69d870cc16dd232375c2603596f2) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 19 ++++++++++++++++++- + 1 file changed, 18 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index fe7d0fece80..38278497319 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -49,6 +49,18 @@ static inline bool sclp_command_code_valid(uint32_t code) + return false; + } + ++static bool sccb_verify_boundary(uint64_t sccb_addr, uint16_t sccb_len) ++{ ++ uint64_t sccb_max_addr = sccb_addr + sccb_len - 1; ++ uint64_t sccb_boundary = (sccb_addr & PAGE_MASK) + PAGE_SIZE; ++ ++ if (sccb_max_addr < sccb_boundary) { ++ return true; ++ } ++ ++ return false; ++} ++ + static void prepare_cpu_entries(MachineState *ms, CPUEntry *entry, int *count) + { + uint8_t features[SCCB_CPU_FEATURE_LEN] = { 0 }; +@@ -229,6 +241,11 @@ int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, + goto out_write; + } + ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb.h.length))) { ++ work_sccb.h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); ++ goto out_write; ++ } ++ + sclp_c->execute(sclp, &work_sccb, code); + out_write: + s390_cpu_pv_mem_write(env_archcpu(env), 0, &work_sccb, +@@ -274,7 +291,7 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + goto out_write; + } + +- if ((sccb + be16_to_cpu(work_sccb.h.length)) > ((sccb & PAGE_MASK) + PAGE_SIZE)) { ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb.h.length))) { + work_sccb.h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } +-- +2.27.0 + diff --git a/kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch 
b/kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch new file mode 100755 index 0000000..cb99830 --- /dev/null +++ b/kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch @@ -0,0 +1,67 @@ +From adf66c037e60d66f864960b24c746b767efb10b9 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:10 -0500 +Subject: [PATCH 10/18] s390/sclp: use cpu offset to locate cpu entries + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-7-thuth@redhat.com> +Patchwork-id: 99503 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 06/12] s390/sclp: use cpu offset to locate cpu entries +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +The start of the CPU entry region in the Read SCP Info response data is +denoted by the offset_cpu field. As such, QEMU needs to begin creating +entries at this address. + +This is in preparation for when Read SCP Info inevitably introduces new +bytes that push the start of the CPUEntry field further away. + +Read CPU Info is unlikely to ever change, so let's not bother +accounting for the offset there. + +Signed-off-by: Collin Walling +Reviewed-by: Thomas Huth +Reviewed-by: Cornelia Huck +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-6-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 1a7a568859473b1cda39a015493c5c82bb200281) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/sclp.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 2b4c6c5cfad..017989b3888 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -89,6 +89,8 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + int rnsize, rnmax; + IplParameterBlock *ipib = s390_ipl_get_iplb(); + int required_len = SCCB_REQ_LEN(ReadInfo, machine->possible_cpus->len); ++ int offset_cpu = offsetof(ReadInfo, entries); ++ CPUEntry *entries_start = (void *)sccb + offset_cpu; + + if (be16_to_cpu(sccb->h.length) < required_len) { + sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); +@@ -96,9 +98,9 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + } + + /* CPU information */ +- prepare_cpu_entries(machine, read_info->entries, &cpu_count); ++ prepare_cpu_entries(machine, entries_start, &cpu_count); + read_info->entries_cpu = cpu_to_be16(cpu_count); +- read_info->offset_cpu = cpu_to_be16(offsetof(ReadInfo, entries)); ++ read_info->offset_cpu = cpu_to_be16(offset_cpu); + read_info->highest_cpu = cpu_to_be16(machine->smp.max_cpus - 1); + + read_info->ibc_val = cpu_to_be32(s390_get_ibc_val()); +-- +2.27.0 + diff --git a/kvm-s390x-Add-SIDA-memory-ops.patch b/kvm-s390x-Add-SIDA-memory-ops.patch new file mode 100755 index 0000000..1b566d7 --- /dev/null +++ b/kvm-s390x-Add-SIDA-memory-ops.patch @@ -0,0 +1,150 @@ +From ebcd74c2267d69fe09ca03cb8bfed7bef5ea3a85 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:08 -0400 +Subject: [PATCH 26/42] s390x: Add SIDA memory ops + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-27-thuth@redhat.com> +Patchwork-id: 97033 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 26/38] s390x: Add SIDA memory ops +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Protected guests save the instruction control blocks in the SIDA +instead of 
QEMU/KVM directly accessing the guest's memory. + +Let's introduce new functions to access the SIDA. + +The memops for doing so are available with KVM_CAP_S390_PROTECTED, so +let's check for that. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-8-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 1cca8265499d394d9ed4bfb75bd6e7265b529f89) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.h | 7 ++++++- + target/s390x/kvm.c | 26 ++++++++++++++++++++++++++ + target/s390x/kvm_s390x.h | 2 ++ + target/s390x/mmu_helper.c | 14 ++++++++++++++ + 4 files changed, 48 insertions(+), 1 deletion(-) + +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 1ff84e6b3a..edf8391504 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -828,7 +828,12 @@ int s390_cpu_virt_mem_rw(S390CPU *cpu, vaddr laddr, uint8_t ar, void *hostbuf, + #define s390_cpu_virt_mem_check_write(cpu, laddr, ar, len) \ + s390_cpu_virt_mem_rw(cpu, laddr, ar, NULL, len, true) + void s390_cpu_virt_mem_handle_exc(S390CPU *cpu, uintptr_t ra); +- ++int s390_cpu_pv_mem_rw(S390CPU *cpu, unsigned int offset, void *hostbuf, ++ int len, bool is_write); ++#define s390_cpu_pv_mem_read(cpu, offset, dest, len) \ ++ s390_cpu_pv_mem_rw(cpu, offset, dest, len, false) ++#define s390_cpu_pv_mem_write(cpu, offset, dest, len) \ ++ s390_cpu_pv_mem_rw(cpu, offset, dest, len, true) + + /* sigp.c */ + int s390_cpu_restart(S390CPU *cpu); +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index af50b2c253..f67bb5ce2c 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -154,6 +154,7 @@ static int cap_ri; + static int cap_gs; + static int cap_hpage_1m; + static int cap_vcpu_resets; ++static int cap_protected; + + static int active_cmma; + +@@ -351,6 +352,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + cap_mem_op = 
kvm_check_extension(s, KVM_CAP_S390_MEM_OP); + cap_s390_irq = kvm_check_extension(s, KVM_CAP_S390_INJECT_IRQ); + cap_vcpu_resets = kvm_check_extension(s, KVM_CAP_S390_VCPU_RESETS); ++ cap_protected = kvm_check_extension(s, KVM_CAP_S390_PROTECTED); + + if (!kvm_check_extension(s, KVM_CAP_S390_GMAP) + || !kvm_check_extension(s, KVM_CAP_S390_COW)) { +@@ -848,6 +850,30 @@ int kvm_s390_mem_op(S390CPU *cpu, vaddr addr, uint8_t ar, void *hostbuf, + return ret; + } + ++int kvm_s390_mem_op_pv(S390CPU *cpu, uint64_t offset, void *hostbuf, ++ int len, bool is_write) ++{ ++ struct kvm_s390_mem_op mem_op = { ++ .sida_offset = offset, ++ .size = len, ++ .op = is_write ? KVM_S390_MEMOP_SIDA_WRITE ++ : KVM_S390_MEMOP_SIDA_READ, ++ .buf = (uint64_t)hostbuf, ++ }; ++ int ret; ++ ++ if (!cap_mem_op || !cap_protected) { ++ return -ENOSYS; ++ } ++ ++ ret = kvm_vcpu_ioctl(CPU(cpu), KVM_S390_MEM_OP, &mem_op); ++ if (ret < 0) { ++ error_report("KVM_S390_MEM_OP failed: %s", strerror(-ret)); ++ abort(); ++ } ++ return ret; ++} ++ + /* + * Legacy layout for s390: + * Older S390 KVM requires the topmost vma of the RAM to be +diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h +index dea813f450..6ab17c81b7 100644 +--- a/target/s390x/kvm_s390x.h ++++ b/target/s390x/kvm_s390x.h +@@ -19,6 +19,8 @@ void kvm_s390_vcpu_interrupt(S390CPU *cpu, struct kvm_s390_irq *irq); + void kvm_s390_access_exception(S390CPU *cpu, uint16_t code, uint64_t te_code); + int kvm_s390_mem_op(S390CPU *cpu, vaddr addr, uint8_t ar, void *hostbuf, + int len, bool is_write); ++int kvm_s390_mem_op_pv(S390CPU *cpu, vaddr addr, void *hostbuf, int len, ++ bool is_write); + void kvm_s390_program_interrupt(S390CPU *cpu, uint16_t code); + int kvm_s390_set_cpu_state(S390CPU *cpu, uint8_t cpu_state); + void kvm_s390_vcpu_interrupt_pre_save(S390CPU *cpu); +diff --git a/target/s390x/mmu_helper.c b/target/s390x/mmu_helper.c +index c9f3f34750..ec8befbdc8 100644 +--- a/target/s390x/mmu_helper.c ++++ b/target/s390x/mmu_helper.c 
+@@ -474,6 +474,20 @@ static int translate_pages(S390CPU *cpu, vaddr addr, int nr_pages, + return 0; + } + ++int s390_cpu_pv_mem_rw(S390CPU *cpu, unsigned int offset, void *hostbuf, ++ int len, bool is_write) ++{ ++ int ret; ++ ++ if (kvm_enabled()) { ++ ret = kvm_s390_mem_op_pv(cpu, offset, hostbuf, len, is_write); ++ } else { ++ /* Protected Virtualization is a KVM/Hardware only feature */ ++ g_assert_not_reached(); ++ } ++ return ret; ++} ++ + /** + * s390_cpu_virt_mem_rw: + * @laddr: the logical start address +-- +2.27.0 + diff --git a/kvm-s390x-Add-missing-vcpu-reset-functions.patch b/kvm-s390x-Add-missing-vcpu-reset-functions.patch new file mode 100755 index 0000000..9ce071e --- /dev/null +++ b/kvm-s390x-Add-missing-vcpu-reset-functions.patch @@ -0,0 +1,176 @@ +From e11643b5363262e9f809762a1f2bb5c4a8f26c2a Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:56 -0400 +Subject: [PATCH 14/42] s390x: Add missing vcpu reset functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-15-thuth@redhat.com> +Patchwork-id: 97023 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 14/38] s390x: Add missing vcpu reset functions +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Up to now we only had an ioctl to reset vcpu data QEMU couldn't reach +for the initial reset, which was also called for the clear reset. To +be architecture compliant, we also need to clear local interrupts on a +normal reset. + +Because of this and the upcoming protvirt support we need to add +ioctls for the missing clear and normal resets. 
+ +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Acked-by: David Hildenbrand +Message-Id: <20200214151636.8764-3-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit b91a03946e0f65ddd22927dd80ca1276bf89c5af) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.c | 14 ++++++++++++-- + target/s390x/kvm-stub.c | 10 +++++++++- + target/s390x/kvm.c | 42 ++++++++++++++++++++++++++++++++-------- + target/s390x/kvm_s390x.h | 4 +++- + 4 files changed, 58 insertions(+), 12 deletions(-) + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index e538a4a3e2..c0dd502b84 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -144,8 +144,18 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + } + + /* Reset state inside the kernel that we cannot access yet from QEMU. */ +- if (kvm_enabled() && type != S390_CPU_RESET_NORMAL) { +- kvm_s390_reset_vcpu(cpu); ++ if (kvm_enabled()) { ++ switch (type) { ++ case S390_CPU_RESET_CLEAR: ++ kvm_s390_reset_vcpu_clear(cpu); ++ break; ++ case S390_CPU_RESET_INITIAL: ++ kvm_s390_reset_vcpu_initial(cpu); ++ break; ++ case S390_CPU_RESET_NORMAL: ++ kvm_s390_reset_vcpu_normal(cpu); ++ break; ++ } + } + } + +diff --git a/target/s390x/kvm-stub.c b/target/s390x/kvm-stub.c +index 5152e2bdf1..c4cd497f85 100644 +--- a/target/s390x/kvm-stub.c ++++ b/target/s390x/kvm-stub.c +@@ -83,7 +83,15 @@ void kvm_s390_cmma_reset(void) + { + } + +-void kvm_s390_reset_vcpu(S390CPU *cpu) ++void kvm_s390_reset_vcpu_initial(S390CPU *cpu) ++{ ++} ++ ++void kvm_s390_reset_vcpu_clear(S390CPU *cpu) ++{ ++} ++ ++void kvm_s390_reset_vcpu_normal(S390CPU *cpu) + { + } + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 1c5bc7a2f9..75d82af6fc 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -151,6 +151,7 @@ static int cap_s390_irq; + static int cap_ri; + static int cap_gs; + static int cap_hpage_1m; ++static int cap_vcpu_resets; + + static int active_cmma; + +@@ -342,6 +343,7 @@ int 
kvm_arch_init(MachineState *ms, KVMState *s) + cap_async_pf = kvm_check_extension(s, KVM_CAP_ASYNC_PF); + cap_mem_op = kvm_check_extension(s, KVM_CAP_S390_MEM_OP); + cap_s390_irq = kvm_check_extension(s, KVM_CAP_S390_INJECT_IRQ); ++ cap_vcpu_resets = kvm_check_extension(s, KVM_CAP_S390_VCPU_RESETS); + + if (!kvm_check_extension(s, KVM_CAP_S390_GMAP) + || !kvm_check_extension(s, KVM_CAP_S390_COW)) { +@@ -403,17 +405,41 @@ int kvm_arch_destroy_vcpu(CPUState *cs) + return 0; + } + +-void kvm_s390_reset_vcpu(S390CPU *cpu) ++static void kvm_s390_reset_vcpu(S390CPU *cpu, unsigned long type) + { + CPUState *cs = CPU(cpu); + +- /* The initial reset call is needed here to reset in-kernel +- * vcpu data that we can't access directly from QEMU +- * (i.e. with older kernels which don't support sync_regs/ONE_REG). +- * Before this ioctl cpu_synchronize_state() is called in common kvm +- * code (kvm-all) */ +- if (kvm_vcpu_ioctl(cs, KVM_S390_INITIAL_RESET, NULL)) { +- error_report("Initial CPU reset failed on CPU %i", cs->cpu_index); ++ /* ++ * The reset call is needed here to reset in-kernel vcpu data that ++ * we can't access directly from QEMU (i.e. with older kernels ++ * which don't support sync_regs/ONE_REG). Before this ioctl ++ * cpu_synchronize_state() is called in common kvm code ++ * (kvm-all). 
++ */ ++ if (kvm_vcpu_ioctl(cs, type)) { ++ error_report("CPU reset failed on CPU %i type %lx", ++ cs->cpu_index, type); ++ } ++} ++ ++void kvm_s390_reset_vcpu_initial(S390CPU *cpu) ++{ ++ kvm_s390_reset_vcpu(cpu, KVM_S390_INITIAL_RESET); ++} ++ ++void kvm_s390_reset_vcpu_clear(S390CPU *cpu) ++{ ++ if (cap_vcpu_resets) { ++ kvm_s390_reset_vcpu(cpu, KVM_S390_CLEAR_RESET); ++ } else { ++ kvm_s390_reset_vcpu(cpu, KVM_S390_INITIAL_RESET); ++ } ++} ++ ++void kvm_s390_reset_vcpu_normal(S390CPU *cpu) ++{ ++ if (cap_vcpu_resets) { ++ kvm_s390_reset_vcpu(cpu, KVM_S390_NORMAL_RESET); + } + } + +diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h +index caf985955b..0b21789796 100644 +--- a/target/s390x/kvm_s390x.h ++++ b/target/s390x/kvm_s390x.h +@@ -34,7 +34,9 @@ int kvm_s390_assign_subch_ioeventfd(EventNotifier *notifier, uint32_t sch, + int vq, bool assign); + int kvm_s390_cmma_active(void); + void kvm_s390_cmma_reset(void); +-void kvm_s390_reset_vcpu(S390CPU *cpu); ++void kvm_s390_reset_vcpu_clear(S390CPU *cpu); ++void kvm_s390_reset_vcpu_normal(S390CPU *cpu); ++void kvm_s390_reset_vcpu_initial(S390CPU *cpu); + int kvm_s390_set_mem_limit(uint64_t new_limit, uint64_t *hw_limit); + void kvm_s390_set_max_pagesize(uint64_t pagesize, Error **errp); + void kvm_s390_crypto_reset(void); +-- +2.27.0 + diff --git a/kvm-s390x-Add-unpack-facility-feature-to-GA1.patch b/kvm-s390x-Add-unpack-facility-feature-to-GA1.patch new file mode 100755 index 0000000..8ffb7b0 --- /dev/null +++ b/kvm-s390x-Add-unpack-facility-feature-to-GA1.patch @@ -0,0 +1,76 @@ +From ab670456375f0d9b9b2d219fd497d04ec0009e1d Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:16 -0400 +Subject: [PATCH 34/42] s390x: Add unpack facility feature to GA1 + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-35-thuth@redhat.com> +Patchwork-id: 97052 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 34/38] s390x: Add unpack facility feature to GA1 +Bugzilla: 1828317 +RH-Acked-by: Claudio 
Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +The unpack facility is an indication that diagnose 308 subcodes 8-10 +are available to the guest. That means, that the guest can put itself +into protected mode. + +Once it is in protected mode, the hardware stops any attempt of VM +introspection by the hypervisor. + +Some features are currently not supported in protected mode: + * vfio devices + * Migration + * Huge page backings + +Signed-off-by: Christian Borntraeger +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-17-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 572c0826615737f1c095b1b6d9e381ec40f72eb5) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/gen-features.c | 1 + + target/s390x/kvm.c | 8 ++++++++ + 2 files changed, 9 insertions(+) + +diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c +index 6278845b12..8ddeebc544 100644 +--- a/target/s390x/gen-features.c ++++ b/target/s390x/gen-features.c +@@ -562,6 +562,7 @@ static uint16_t full_GEN15_GA1[] = { + S390_FEAT_GROUP_MSA_EXT_9, + S390_FEAT_GROUP_MSA_EXT_9_PCKMO, + S390_FEAT_ETOKEN, ++ S390_FEAT_UNPACK, + }; + + /* Default features (in order of release) +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 56fe60c49c..84d7cadd09 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -2407,6 +2407,14 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp) + clear_bit(S390_FEAT_BPB, model->features); + } + ++ /* ++ * If we have support for protected virtualization, indicate ++ * the protected virtualization IPL unpack facility. 
++ */ ++ if (cap_protected) { ++ set_bit(S390_FEAT_UNPACK, model->features); ++ } ++ + /* We emulate a zPCI bus and AEN, therefore we don't need HW support */ + set_bit(S390_FEAT_ZPCI, model->features); + set_bit(S390_FEAT_ADAPTER_EVENT_NOTIFICATION, model->features); +-- +2.27.0 + diff --git a/kvm-s390x-Beautify-diag308-handling.patch b/kvm-s390x-Beautify-diag308-handling.patch new file mode 100755 index 0000000..2ffe6a3 --- /dev/null +++ b/kvm-s390x-Beautify-diag308-handling.patch @@ -0,0 +1,130 @@ +From da81f2b579987ea12929f0ec803716bc16a93df7 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:49 -0400 +Subject: [PATCH 07/42] s390x: Beautify diag308 handling +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-8-thuth@redhat.com> +Patchwork-id: 97022 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 07/38] s390x: Beautify diag308 handling +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's improve readability by: +* Using constants for the subcodes +* Moving parameter checking into a function +* Removing subcode > 6 check as the default case catches that + +Signed-off-by: Janosch Frank +Reviewed-by: Cornelia Huck +Reviewed-by: Thomas Huth +Reviewed-by: David Hildenbrand +Message-Id: <20191127175046.4911-6-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 0b7fd817e0f383760e37ca9286150d5816cf0594) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/s390x/diag.c | 54 +++++++++++++++++++++++++++------------------ + 1 file changed, 32 insertions(+), 22 deletions(-) + +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index 53c2f81f2a..b5aec06d6b 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -53,6 +53,29 @@ int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3) + #define DIAG_308_RC_NO_CONF 0x0102 + #define DIAG_308_RC_INVALID 0x0402 + ++#define DIAG308_RESET_MOD_CLR 0 ++#define DIAG308_RESET_LOAD_NORM 1 ++#define DIAG308_LOAD_CLEAR 3 ++#define DIAG308_LOAD_NORMAL_DUMP 4 ++#define DIAG308_SET 5 ++#define DIAG308_STORE 6 ++ ++static int diag308_parm_check(CPUS390XState *env, uint64_t r1, uint64_t addr, ++ uintptr_t ra, bool write) ++{ ++ if ((r1 & 1) || (addr & ~TARGET_PAGE_MASK)) { ++ s390_program_interrupt(env, PGM_SPECIFICATION, ra); ++ return -1; ++ } ++ if (!address_space_access_valid(&address_space_memory, addr, ++ sizeof(IplParameterBlock), write, ++ MEMTXATTRS_UNSPECIFIED)) { ++ s390_program_interrupt(env, PGM_ADDRESSING, ra); ++ return -1; ++ } ++ return 0; ++} ++ + void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + { + CPUState *cs = env_cpu(env); +@@ -65,30 +88,24 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + return; + } + +- if ((subcode & ~0x0ffffULL) || (subcode > 6)) { ++ if (subcode & ~0x0ffffULL) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; + } + + switch (subcode) { +- case 0: ++ case DIAG308_RESET_MOD_CLR: + s390_ipl_reset_request(cs, S390_RESET_MODIFIED_CLEAR); + break; +- case 1: ++ case DIAG308_RESET_LOAD_NORM: + s390_ipl_reset_request(cs, S390_RESET_LOAD_NORMAL); + break; +- case 3: ++ case DIAG308_LOAD_CLEAR: ++ /* Well we still lack the clearing bit... 
*/ + s390_ipl_reset_request(cs, S390_RESET_REIPL); + break; +- case 5: +- if ((r1 & 1) || (addr & 0x0fffULL)) { +- s390_program_interrupt(env, PGM_SPECIFICATION, ra); +- return; +- } +- if (!address_space_access_valid(&address_space_memory, addr, +- sizeof(IplParameterBlock), false, +- MEMTXATTRS_UNSPECIFIED)) { +- s390_program_interrupt(env, PGM_ADDRESSING, ra); ++ case DIAG308_SET: ++ if (diag308_parm_check(env, r1, addr, ra, false)) { + return; + } + iplb = g_new0(IplParameterBlock, 1); +@@ -110,15 +127,8 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + out: + g_free(iplb); + return; +- case 6: +- if ((r1 & 1) || (addr & 0x0fffULL)) { +- s390_program_interrupt(env, PGM_SPECIFICATION, ra); +- return; +- } +- if (!address_space_access_valid(&address_space_memory, addr, +- sizeof(IplParameterBlock), true, +- MEMTXATTRS_UNSPECIFIED)) { +- s390_program_interrupt(env, PGM_ADDRESSING, ra); ++ case DIAG308_STORE: ++ if (diag308_parm_check(env, r1, addr, ra, true)) { + return; + } + iplb = s390_ipl_get_iplb(); +-- +2.27.0 + diff --git a/kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch b/kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch new file mode 100755 index 0000000..dab8acc --- /dev/null +++ b/kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch @@ -0,0 +1,52 @@ +From 511638161566d4944a572a31d787eb27bbc0bc8e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:45 -0400 +Subject: [PATCH 03/42] s390x: Don't do a normal reset on the initial cpu +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-4-thuth@redhat.com> +Patchwork-id: 97017 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 03/38] s390x: Don't do a normal reset on the initial cpu +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: 
Janosch Frank + +The initiating cpu needs to be reset with an initial reset. While +doing a normal reset followed by a initial reset is not wrong per se, +the Ultravisor will only allow the correct reset to be performed. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Cornelia Huck +Message-Id: <20191127175046.4911-2-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit ec9227339fce99412830d44a37eb0bd2fadd5f75) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index c2c83d2fce..4ea01c53c0 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -348,6 +348,9 @@ static void s390_machine_reset(MachineState *machine) + break; + case S390_RESET_LOAD_NORMAL: + CPU_FOREACH(t) { ++ if (t == cs) { ++ continue; ++ } + run_on_cpu(t, s390_do_cpu_reset, RUN_ON_CPU_NULL); + } + subsystem_reset(); +-- +2.27.0 + diff --git a/kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch b/kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch new file mode 100755 index 0000000..9b81586 --- /dev/null +++ b/kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch @@ -0,0 +1,101 @@ +From bdad28b11e36f657cb8909e7223a7d8fc0948c2e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:51 -0400 +Subject: [PATCH 09/42] s390x: Fix cpu normal reset ri clearing +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-10-thuth@redhat.com> +Patchwork-id: 97029 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 09/38] s390x: Fix cpu normal reset ri clearing +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +As it turns out we need to clear the ri controls and PSW enablement +bit to be 
architecture compliant. + +Signed-off-by: Janosch Frank +Reviewed-by: Christian Borntraeger +Message-Id: <20191203132813.2734-4-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit e893baee70149896d1e43e341da4d6c614037d5d) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.c | 7 ++++++- + target/s390x/cpu.h | 7 ++++++- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index bd39cb54b7..99ea09085a 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -100,7 +100,7 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + case S390_CPU_RESET_INITIAL: + /* initial reset does not clear everything! */ + memset(&env->start_initial_reset_fields, 0, +- offsetof(CPUS390XState, end_reset_fields) - ++ offsetof(CPUS390XState, start_normal_reset_fields) - + offsetof(CPUS390XState, start_initial_reset_fields)); + + /* architectured initial value for Breaking-Event-Address register */ +@@ -123,6 +123,11 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + &env->fpu_status); + /* fall through */ + case S390_CPU_RESET_NORMAL: ++ env->psw.mask &= ~PSW_MASK_RI; ++ memset(&env->start_normal_reset_fields, 0, ++ offsetof(CPUS390XState, end_reset_fields) - ++ offsetof(CPUS390XState, start_normal_reset_fields)); ++ + env->pfault_token = -1UL; + env->bpbc = false; + break; +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index d2af13b345..7e1c18d596 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -58,7 +58,6 @@ struct CPUS390XState { + */ + uint64_t vregs[32][2] QEMU_ALIGNED(16); /* vector registers */ + uint32_t aregs[16]; /* access registers */ +- uint8_t riccb[64]; /* runtime instrumentation control */ + uint64_t gscb[4]; /* guarded storage control */ + uint64_t etoken; /* etoken */ + uint64_t etoken_extension; /* etoken extension */ +@@ -114,6 +113,10 @@ struct CPUS390XState { + uint64_t gbea; + uint64_t pp; + ++ /* Fields up to this point 
are not cleared by normal CPU reset */ ++ struct {} start_normal_reset_fields; ++ uint8_t riccb[64]; /* runtime instrumentation control */ ++ + /* Fields up to this point are cleared by a CPU reset */ + struct {} end_reset_fields; + +@@ -252,6 +255,7 @@ extern const VMStateDescription vmstate_s390_cpu; + #undef PSW_SHIFT_ASC + #undef PSW_MASK_CC + #undef PSW_MASK_PM ++#undef PSW_MASK_RI + #undef PSW_SHIFT_MASK_PM + #undef PSW_MASK_64 + #undef PSW_MASK_32 +@@ -273,6 +277,7 @@ extern const VMStateDescription vmstate_s390_cpu; + #define PSW_MASK_CC 0x0000300000000000ULL + #define PSW_MASK_PM 0x00000F0000000000ULL + #define PSW_SHIFT_MASK_PM 40 ++#define PSW_MASK_RI 0x0000008000000000ULL + #define PSW_MASK_64 0x0000000100000000ULL + #define PSW_MASK_32 0x0000000080000000ULL + #define PSW_MASK_ESA_ADDR 0x000000007fffffffULL +-- +2.27.0 + diff --git a/kvm-s390x-Move-clear-reset.patch b/kvm-s390x-Move-clear-reset.patch new file mode 100755 index 0000000..7c1614c --- /dev/null +++ b/kvm-s390x-Move-clear-reset.patch @@ -0,0 +1,146 @@ +From f268cc7071ecb4322c03f3183acbcf90421da3c7 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:48 -0400 +Subject: [PATCH 06/42] s390x: Move clear reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-7-thuth@redhat.com> +Patchwork-id: 97019 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 06/38] s390x: Move clear reset +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's also move the clear reset function into the reset handler. + +Signed-off-by: Janosch Frank +Message-Id: <20191127175046.4911-5-frankja@linux.ibm.com> +Reviewed-by: David Hildenbrand +Reviewed-by: Thomas Huth +Signed-off-by: Cornelia Huck +(cherry picked from commit eb8adcc3e9e3b8405c104ede72cf9f3bb2a5e226) +Signed-off-by: Danilo C. 
L. de Paula +--- + target/s390x/cpu-qom.h | 1 + + target/s390x/cpu.c | 58 +++++++++++++----------------------------- + 2 files changed, 18 insertions(+), 41 deletions(-) + +diff --git a/target/s390x/cpu-qom.h b/target/s390x/cpu-qom.h +index 6f0a12042e..dbe5346ec9 100644 +--- a/target/s390x/cpu-qom.h ++++ b/target/s390x/cpu-qom.h +@@ -37,6 +37,7 @@ typedef struct S390CPUDef S390CPUDef; + typedef enum cpu_reset_type { + S390_CPU_RESET_NORMAL, + S390_CPU_RESET_INITIAL, ++ S390_CPU_RESET_CLEAR, + } cpu_reset_type; + + /** +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index ca62fe7685..bd39cb54b7 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -94,6 +94,9 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + s390_cpu_set_state(S390_CPU_STATE_STOPPED, cpu); + + switch (type) { ++ case S390_CPU_RESET_CLEAR: ++ memset(env, 0, offsetof(CPUS390XState, start_initial_reset_fields)); ++ /* fall through */ + case S390_CPU_RESET_INITIAL: + /* initial reset does not clear everything! 
*/ + memset(&env->start_initial_reset_fields, 0, +@@ -107,6 +110,14 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + env->cregs[0] = CR0_RESET; + env->cregs[14] = CR14_RESET; + ++#if defined(CONFIG_USER_ONLY) ++ /* user mode should always be allowed to use the full FPU */ ++ env->cregs[0] |= CR0_AFP; ++ if (s390_has_feat(S390_FEAT_VECTOR)) { ++ env->cregs[0] |= CR0_VECTOR; ++ } ++#endif ++ + /* tininess for underflow is detected before rounding */ + set_float_detect_tininess(float_tininess_before_rounding, + &env->fpu_status); +@@ -125,46 +136,6 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + } + } + +-/* CPUClass:reset() */ +-static void s390_cpu_full_reset(CPUState *s) +-{ +- S390CPU *cpu = S390_CPU(s); +- S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); +- CPUS390XState *env = &cpu->env; +- +- scc->parent_reset(s); +- cpu->env.sigp_order = 0; +- s390_cpu_set_state(S390_CPU_STATE_STOPPED, cpu); +- +- memset(env, 0, offsetof(CPUS390XState, end_reset_fields)); +- +- /* architectured initial values for CR 0 and 14 */ +- env->cregs[0] = CR0_RESET; +- env->cregs[14] = CR14_RESET; +- +-#if defined(CONFIG_USER_ONLY) +- /* user mode should always be allowed to use the full FPU */ +- env->cregs[0] |= CR0_AFP; +- if (s390_has_feat(S390_FEAT_VECTOR)) { +- env->cregs[0] |= CR0_VECTOR; +- } +-#endif +- +- /* architectured initial value for Breaking-Event-Address register */ +- env->gbea = 1; +- +- env->pfault_token = -1UL; +- +- /* tininess for underflow is detected before rounding */ +- set_float_detect_tininess(float_tininess_before_rounding, +- &env->fpu_status); +- +- /* Reset state inside the kernel that we cannot access yet from QEMU. 
*/ +- if (kvm_enabled()) { +- kvm_s390_reset_vcpu(cpu); +- } +-} +- + #if !defined(CONFIG_USER_ONLY) + static void s390_cpu_machine_reset_cb(void *opaque) + { +@@ -456,6 +427,11 @@ static Property s390x_cpu_properties[] = { + DEFINE_PROP_END_OF_LIST() + }; + ++static void s390_cpu_reset_full(CPUState *s) ++{ ++ return s390_cpu_reset(s, S390_CPU_RESET_CLEAR); ++} ++ + static void s390_cpu_class_init(ObjectClass *oc, void *data) + { + S390CPUClass *scc = S390_CPU_CLASS(oc); +@@ -472,7 +448,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) + scc->load_normal = s390_cpu_load_normal; + #endif + scc->reset = s390_cpu_reset; +- cc->reset = s390_cpu_full_reset; ++ cc->reset = s390_cpu_reset_full; + cc->class_by_name = s390_cpu_class_by_name, + cc->has_work = s390_cpu_has_work; + #ifdef CONFIG_TCG +-- +2.27.0 + diff --git a/kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch b/kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch new file mode 100755 index 0000000..ac183cf --- /dev/null +++ b/kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch @@ -0,0 +1,83 @@ +From c9eee8aeed39976293e0d857039fcf729b821e83 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:03 -0400 +Subject: [PATCH 21/42] s390x: Move diagnose 308 subcodes and rcs into ipl.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-22-thuth@redhat.com> +Patchwork-id: 97032 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 21/38] s390x: Move diagnose 308 subcodes and rcs into ipl.h +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +They are part of the IPL process, so let's put them into the ipl +header. 
+ +Signed-off-by: Janosch Frank +Reviewed-by: Cornelia Huck +Reviewed-by: Christian Borntraeger +Reviewed-by: David Hildenbrand +Message-Id: <20200319131921.2367-2-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 9b39d29470e9dbef24ee842a44ea56bd92b855ea) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.h | 11 +++++++++++ + target/s390x/diag.c | 11 ----------- + 2 files changed, 11 insertions(+), 11 deletions(-) + +diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h +index 3e44abe1c6..a5665e6bfd 100644 +--- a/hw/s390x/ipl.h ++++ b/hw/s390x/ipl.h +@@ -159,6 +159,17 @@ struct S390IPLState { + typedef struct S390IPLState S390IPLState; + QEMU_BUILD_BUG_MSG(offsetof(S390IPLState, iplb) & 3, "alignment of iplb wrong"); + ++#define DIAG_308_RC_OK 0x0001 ++#define DIAG_308_RC_NO_CONF 0x0102 ++#define DIAG_308_RC_INVALID 0x0402 ++ ++#define DIAG308_RESET_MOD_CLR 0 ++#define DIAG308_RESET_LOAD_NORM 1 ++#define DIAG308_LOAD_CLEAR 3 ++#define DIAG308_LOAD_NORMAL_DUMP 4 ++#define DIAG308_SET 5 ++#define DIAG308_STORE 6 ++ + #define S390_IPL_TYPE_FCP 0x00 + #define S390_IPL_TYPE_CCW 0x02 + #define S390_IPL_TYPE_QEMU_SCSI 0xff +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index 54e5670b3f..8aba6341f9 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -49,17 +49,6 @@ int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3) + return diag288_class->handle_timer(diag288, func, timeout); + } + +-#define DIAG_308_RC_OK 0x0001 +-#define DIAG_308_RC_NO_CONF 0x0102 +-#define DIAG_308_RC_INVALID 0x0402 +- +-#define DIAG308_RESET_MOD_CLR 0 +-#define DIAG308_RESET_LOAD_NORM 1 +-#define DIAG308_LOAD_CLEAR 3 +-#define DIAG308_LOAD_NORMAL_DUMP 4 +-#define DIAG308_SET 5 +-#define DIAG308_STORE 6 +- + static int diag308_parm_check(CPUS390XState *env, uint64_t r1, uint64_t addr, + uintptr_t ra, bool write) + { +-- +2.27.0 + diff --git a/kvm-s390x-Move-initial-reset.patch b/kvm-s390x-Move-initial-reset.patch new file mode 100755 
index 0000000..0f2e9ab --- /dev/null +++ b/kvm-s390x-Move-initial-reset.patch @@ -0,0 +1,159 @@ +From 0d1c0adf25a323be0663863ebe44a6aefb5f7baf Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:47 -0400 +Subject: [PATCH 05/42] s390x: Move initial reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-6-thuth@redhat.com> +Patchwork-id: 97024 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 05/38] s390x: Move initial reset +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's move the initial reset into the reset handler and clean up +afterwards. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20191128083723.11937-1-frankja@linux.ibm.com> +Reviewed-by: Thomas Huth +Signed-off-by: Cornelia Huck +(cherry picked from commit 81b9222358e5c8f666f0d86057c75e40531d804c) +Signed-off-by: Danilo C. L.
de Paula +--- + target/s390x/cpu-qom.h | 2 +- + target/s390x/cpu.c | 46 +++++++++++++++++------------------------- + target/s390x/cpu.h | 2 +- + target/s390x/sigp.c | 2 +- + 4 files changed, 21 insertions(+), 31 deletions(-) + +diff --git a/target/s390x/cpu-qom.h b/target/s390x/cpu-qom.h +index f3b71bac67..6f0a12042e 100644 +--- a/target/s390x/cpu-qom.h ++++ b/target/s390x/cpu-qom.h +@@ -36,6 +36,7 @@ typedef struct S390CPUDef S390CPUDef; + + typedef enum cpu_reset_type { + S390_CPU_RESET_NORMAL, ++ S390_CPU_RESET_INITIAL, + } cpu_reset_type; + + /** +@@ -62,7 +63,6 @@ typedef struct S390CPUClass { + void (*parent_reset)(CPUState *cpu); + void (*load_normal)(CPUState *cpu); + void (*reset)(CPUState *cpu, cpu_reset_type type); +- void (*initial_cpu_reset)(CPUState *cpu); + } S390CPUClass; + + typedef struct S390CPU S390CPU; +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 67d6fbfa44..ca62fe7685 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -94,6 +94,23 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + s390_cpu_set_state(S390_CPU_STATE_STOPPED, cpu); + + switch (type) { ++ case S390_CPU_RESET_INITIAL: ++ /* initial reset does not clear everything! 
*/ ++ memset(&env->start_initial_reset_fields, 0, ++ offsetof(CPUS390XState, end_reset_fields) - ++ offsetof(CPUS390XState, start_initial_reset_fields)); ++ ++ /* architectured initial value for Breaking-Event-Address register */ ++ env->gbea = 1; ++ ++ /* architectured initial values for CR 0 and 14 */ ++ env->cregs[0] = CR0_RESET; ++ env->cregs[14] = CR14_RESET; ++ ++ /* tininess for underflow is detected before rounding */ ++ set_float_detect_tininess(float_tininess_before_rounding, ++ &env->fpu_status); ++ /* fall through */ + case S390_CPU_RESET_NORMAL: + env->pfault_token = -1UL; + env->bpbc = false; +@@ -101,35 +118,9 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + default: + g_assert_not_reached(); + } +-} +- +-/* S390CPUClass::initial_reset() */ +-static void s390_cpu_initial_reset(CPUState *s) +-{ +- S390CPU *cpu = S390_CPU(s); +- CPUS390XState *env = &cpu->env; +- +- s390_cpu_reset(s, S390_CPU_RESET_NORMAL); +- /* initial reset does not clear everything! */ +- memset(&env->start_initial_reset_fields, 0, +- offsetof(CPUS390XState, end_reset_fields) - +- offsetof(CPUS390XState, start_initial_reset_fields)); +- +- /* architectured initial values for CR 0 and 14 */ +- env->cregs[0] = CR0_RESET; +- env->cregs[14] = CR14_RESET; +- +- /* architectured initial value for Breaking-Event-Address register */ +- env->gbea = 1; +- +- env->pfault_token = -1UL; +- +- /* tininess for underflow is detected before rounding */ +- set_float_detect_tininess(float_tininess_before_rounding, +- &env->fpu_status); + + /* Reset state inside the kernel that we cannot access yet from QEMU. 
*/ +- if (kvm_enabled()) { ++ if (kvm_enabled() && type != S390_CPU_RESET_NORMAL) { + kvm_s390_reset_vcpu(cpu); + } + } +@@ -481,7 +472,6 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) + scc->load_normal = s390_cpu_load_normal; + #endif + scc->reset = s390_cpu_reset; +- scc->initial_cpu_reset = s390_cpu_initial_reset; + cc->reset = s390_cpu_full_reset; + cc->class_by_name = s390_cpu_class_by_name, + cc->has_work = s390_cpu_has_work; +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 18123dfd5b..d2af13b345 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -748,7 +748,7 @@ static inline void s390_do_cpu_initial_reset(CPUState *cs, run_on_cpu_data arg) + { + S390CPUClass *scc = S390_CPU_GET_CLASS(cs); + +- scc->initial_cpu_reset(cs); ++ scc->reset(cs, S390_CPU_RESET_INITIAL); + } + + static inline void s390_do_cpu_load_normal(CPUState *cs, run_on_cpu_data arg) +diff --git a/target/s390x/sigp.c b/target/s390x/sigp.c +index 850139b9cd..727875bb4a 100644 +--- a/target/s390x/sigp.c ++++ b/target/s390x/sigp.c +@@ -254,7 +254,7 @@ static void sigp_initial_cpu_reset(CPUState *cs, run_on_cpu_data arg) + SigpInfo *si = arg.host_ptr; + + cpu_synchronize_state(cs); +- scc->initial_cpu_reset(cs); ++ scc->reset(cs, S390_CPU_RESET_INITIAL); + cpu_synchronize_post_reset(cs); + si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; + } +-- +2.27.0 + diff --git a/kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch b/kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch new file mode 100755 index 0000000..81a4368 --- /dev/null +++ b/kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch @@ -0,0 +1,145 @@ +From 53b5a7f83f3e6b94c66cbbb97ea42bbf02cb96b4 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:46 -0400 +Subject: [PATCH 04/42] s390x: Move reset normal to shared reset handler +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: 
<20200529055420.16855-5-thuth@redhat.com> +Patchwork-id: 97018 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 04/38] s390x: Move reset normal to shared reset handler +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's start moving the cpu reset functions into a single function with +a switch/case, so we can later use fallthroughs and share more code +between resets. + +This patch introduces the reset function by renaming cpu_reset(). + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20191127175046.4911-3-frankja@linux.ibm.com> +Reviewed-by: Thomas Huth +Signed-off-by: Cornelia Huck +(cherry picked from commit eac4f82791f1807c423e85670837db103b9d59b3) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu-qom.h | 6 +++++- + target/s390x/cpu.c | 19 +++++++++++++------ + target/s390x/cpu.h | 2 +- + target/s390x/sigp.c | 2 +- + 4 files changed, 20 insertions(+), 9 deletions(-) + +diff --git a/target/s390x/cpu-qom.h b/target/s390x/cpu-qom.h +index b809ec8418..f3b71bac67 100644 +--- a/target/s390x/cpu-qom.h ++++ b/target/s390x/cpu-qom.h +@@ -34,6 +34,10 @@ + typedef struct S390CPUModel S390CPUModel; + typedef struct S390CPUDef S390CPUDef; + ++typedef enum cpu_reset_type { ++ S390_CPU_RESET_NORMAL, ++} cpu_reset_type; ++ + /** + * S390CPUClass: + * @parent_realize: The parent class' realize handler. 
+@@ -57,7 +61,7 @@ typedef struct S390CPUClass { + DeviceRealize parent_realize; + void (*parent_reset)(CPUState *cpu); + void (*load_normal)(CPUState *cpu); +- void (*cpu_reset)(CPUState *cpu); ++ void (*reset)(CPUState *cpu, cpu_reset_type type); + void (*initial_cpu_reset)(CPUState *cpu); + } S390CPUClass; + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 3abe7e80fd..67d6fbfa44 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -82,18 +82,25 @@ static void s390_cpu_load_normal(CPUState *s) + } + #endif + +-/* S390CPUClass::cpu_reset() */ +-static void s390_cpu_reset(CPUState *s) ++/* S390CPUClass::reset() */ ++static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + { + S390CPU *cpu = S390_CPU(s); + S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); + CPUS390XState *env = &cpu->env; + +- env->pfault_token = -1UL; +- env->bpbc = false; + scc->parent_reset(s); + cpu->env.sigp_order = 0; + s390_cpu_set_state(S390_CPU_STATE_STOPPED, cpu); ++ ++ switch (type) { ++ case S390_CPU_RESET_NORMAL: ++ env->pfault_token = -1UL; ++ env->bpbc = false; ++ break; ++ default: ++ g_assert_not_reached(); ++ } + } + + /* S390CPUClass::initial_reset() */ +@@ -102,7 +109,7 @@ static void s390_cpu_initial_reset(CPUState *s) + S390CPU *cpu = S390_CPU(s); + CPUS390XState *env = &cpu->env; + +- s390_cpu_reset(s); ++ s390_cpu_reset(s, S390_CPU_RESET_NORMAL); + /* initial reset does not clear everything! 
*/ + memset(&env->start_initial_reset_fields, 0, + offsetof(CPUS390XState, end_reset_fields) - +@@ -473,7 +480,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) + #if !defined(CONFIG_USER_ONLY) + scc->load_normal = s390_cpu_load_normal; + #endif +- scc->cpu_reset = s390_cpu_reset; ++ scc->reset = s390_cpu_reset; + scc->initial_cpu_reset = s390_cpu_initial_reset; + cc->reset = s390_cpu_full_reset; + cc->class_by_name = s390_cpu_class_by_name, +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 17460ed7b3..18123dfd5b 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -741,7 +741,7 @@ static inline void s390_do_cpu_reset(CPUState *cs, run_on_cpu_data arg) + { + S390CPUClass *scc = S390_CPU_GET_CLASS(cs); + +- scc->cpu_reset(cs); ++ scc->reset(cs, S390_CPU_RESET_NORMAL); + } + + static inline void s390_do_cpu_initial_reset(CPUState *cs, run_on_cpu_data arg) +diff --git a/target/s390x/sigp.c b/target/s390x/sigp.c +index 2ce22d4dc1..850139b9cd 100644 +--- a/target/s390x/sigp.c ++++ b/target/s390x/sigp.c +@@ -266,7 +266,7 @@ static void sigp_cpu_reset(CPUState *cs, run_on_cpu_data arg) + SigpInfo *si = arg.host_ptr; + + cpu_synchronize_state(cs); +- scc->cpu_reset(cs); ++ scc->reset(cs, S390_CPU_RESET_NORMAL); + cpu_synchronize_post_reset(cs); + si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; + } +-- +2.27.0 + diff --git a/kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch b/kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch new file mode 100755 index 0000000..9447240 --- /dev/null +++ b/kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch @@ -0,0 +1,70 @@ +From 7171a794e8a7d91805516174187addc3b8e6b423 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:54 -0400 +Subject: [PATCH 12/42] s390x: Properly fetch and test the short psw on diag308 + subc 0/1 + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-13-thuth@redhat.com> +Patchwork-id: 97025 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 
v2 12/38] s390x: Properly fetch and test the short psw on diag308 subc 0/1 +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +We need to actually fetch the cpu mask and set it. As we invert the +short psw indication in the mask, SIE will report a specification +exception, if it wasn't present in the reset psw. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20191129142025.21453-2-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 104130cb7c106378dab944397c6a455c4a6d552f) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.c | 12 ++++++++++-- + target/s390x/cpu.h | 1 + + 2 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 99ea09085a..625daeedd1 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -76,8 +76,16 @@ static bool s390_cpu_has_work(CPUState *cs) + static void s390_cpu_load_normal(CPUState *s) + { + S390CPU *cpu = S390_CPU(s); +- cpu->env.psw.addr = ldl_phys(s->as, 4) & PSW_MASK_ESA_ADDR; +- cpu->env.psw.mask = PSW_MASK_32 | PSW_MASK_64; ++ uint64_t spsw = ldq_phys(s->as, 0); ++ ++ cpu->env.psw.mask = spsw & 0xffffffff80000000ULL; ++ /* ++ * Invert short psw indication, so SIE will report a specification ++ * exception if it was not set. 
++ */ ++ cpu->env.psw.mask ^= PSW_MASK_SHORTPSW; ++ cpu->env.psw.addr = spsw & 0x7fffffffULL; ++ + s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); + } + #endif +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 7e1c18d596..7f5fa1d35b 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -269,6 +269,7 @@ extern const VMStateDescription vmstate_s390_cpu; + #define PSW_MASK_EXT 0x0100000000000000ULL + #define PSW_MASK_KEY 0x00F0000000000000ULL + #define PSW_SHIFT_KEY 52 ++#define PSW_MASK_SHORTPSW 0x0008000000000000ULL + #define PSW_MASK_MCHECK 0x0004000000000000ULL + #define PSW_MASK_WAIT 0x0002000000000000ULL + #define PSW_MASK_PSTATE 0x0001000000000000ULL +-- +2.27.0 + diff --git a/kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch b/kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch new file mode 100755 index 0000000..b1c7e01 --- /dev/null +++ b/kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch @@ -0,0 +1,87 @@ +From 4bd5ae889376816238ecad1bce054b0e198cde2b Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:55 -0400 +Subject: [PATCH 13/42] s390x: Rename and use constants for short PSW address + and mask + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-14-thuth@redhat.com> +Patchwork-id: 97050 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 13/38] s390x: Rename and use constants for short PSW address and mask +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's rename PSW_MASK_ESA_ADDR to PSW_MASK_SHORT_ADDR because we're +not working with a ESA PSW which would not support the extended +addressing bit. Also let's actually use it. + +Additionally we introduce PSW_MASK_SHORT_CTRL and use it throughout +the codebase. 
+ +Signed-off-by: Janosch Frank +Reviewed-by: Christian Borntraeger +Reviewed-by: David Hildenbrand +Message-Id: <20200227092341.38558-1-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit b6c2dbd7214b0b2396e1dcf9668c8b48ab571115) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.c | 2 +- + target/s390x/cpu.c | 4 ++-- + target/s390x/cpu.h | 3 ++- + 3 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index ca544d64c5..0b7548a549 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -179,7 +179,7 @@ static void s390_ipl_realize(DeviceState *dev, Error **errp) + /* if not Linux load the address of the (short) IPL PSW */ + ipl_psw = rom_ptr(4, 4); + if (ipl_psw) { +- pentry = be32_to_cpu(*ipl_psw) & 0x7fffffffUL; ++ pentry = be32_to_cpu(*ipl_psw) & PSW_MASK_SHORT_ADDR; + } else { + error_setg(&err, "Could not get IPL PSW"); + goto error; +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 625daeedd1..e538a4a3e2 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -78,13 +78,13 @@ static void s390_cpu_load_normal(CPUState *s) + S390CPU *cpu = S390_CPU(s); + uint64_t spsw = ldq_phys(s->as, 0); + +- cpu->env.psw.mask = spsw & 0xffffffff80000000ULL; ++ cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL; + /* + * Invert short psw indication, so SIE will report a specification + * exception if it was not set. 
+ */ + cpu->env.psw.mask ^= PSW_MASK_SHORTPSW; +- cpu->env.psw.addr = spsw & 0x7fffffffULL; ++ cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR; + + s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); + } +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 7f5fa1d35b..1ff84e6b3a 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -281,7 +281,8 @@ extern const VMStateDescription vmstate_s390_cpu; + #define PSW_MASK_RI 0x0000008000000000ULL + #define PSW_MASK_64 0x0000000100000000ULL + #define PSW_MASK_32 0x0000000080000000ULL +-#define PSW_MASK_ESA_ADDR 0x000000007fffffffULL ++#define PSW_MASK_SHORT_ADDR 0x000000007fffffffULL ++#define PSW_MASK_SHORT_CTRL 0xffffffff80000000ULL + + #undef PSW_ASC_PRIMARY + #undef PSW_ASC_ACCREG +-- +2.27.0 + diff --git a/kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch b/kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch new file mode 100755 index 0000000..8ce7625 --- /dev/null +++ b/kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch @@ -0,0 +1,119 @@ +From 04d4e7eda95316b64ea9dc0f4ca8801d531652e7 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:41 -0400 +Subject: [PATCH 07/12] s390x/css: Refactor the css_queue_crw() routine + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-8-cohuck@redhat.com> +Patchwork-id: 97700 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 7/9] s390x/css: Refactor the css_queue_crw() routine +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Eric Farman + +We have a use case (vfio-ccw) where a CRW is already built and +ready to use. Rather than teasing out the components just to +reassemble it later, let's rework this code so we can queue a +fully-qualified CRW directly. 
+ +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-6-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit f6dde1b012e678aa64339520ef7519ec04026cf1) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/css.c | 44 ++++++++++++++++++++++++++++-------------- + include/hw/s390x/css.h | 1 + + 2 files changed, 30 insertions(+), 15 deletions(-) + +diff --git a/hw/s390x/css.c b/hw/s390x/css.c +index 71fd3f9a00..a8de8a0c84 100644 +--- a/hw/s390x/css.c ++++ b/hw/s390x/css.c +@@ -2170,30 +2170,23 @@ void css_subch_assign(uint8_t cssid, uint8_t ssid, uint16_t schid, + } + } + +-void css_queue_crw(uint8_t rsc, uint8_t erc, int solicited, +- int chain, uint16_t rsid) ++void css_crw_add_to_queue(CRW crw) + { + CrwContainer *crw_cont; + +- trace_css_crw(rsc, erc, rsid, chain ? "(chained)" : ""); ++ trace_css_crw((crw.flags & CRW_FLAGS_MASK_RSC) >> 8, ++ crw.flags & CRW_FLAGS_MASK_ERC, ++ crw.rsid, ++ (crw.flags & CRW_FLAGS_MASK_C) ? "(chained)" : ""); ++ + /* TODO: Maybe use a static crw pool? 
*/ + crw_cont = g_try_new0(CrwContainer, 1); + if (!crw_cont) { + channel_subsys.crws_lost = true; + return; + } +- crw_cont->crw.flags = (rsc << 8) | erc; +- if (solicited) { +- crw_cont->crw.flags |= CRW_FLAGS_MASK_S; +- } +- if (chain) { +- crw_cont->crw.flags |= CRW_FLAGS_MASK_C; +- } +- crw_cont->crw.rsid = rsid; +- if (channel_subsys.crws_lost) { +- crw_cont->crw.flags |= CRW_FLAGS_MASK_R; +- channel_subsys.crws_lost = false; +- } ++ ++ crw_cont->crw = crw; + + QTAILQ_INSERT_TAIL(&channel_subsys.pending_crws, crw_cont, sibling); + +@@ -2204,6 +2197,27 @@ void css_queue_crw(uint8_t rsc, uint8_t erc, int solicited, + } + } + ++void css_queue_crw(uint8_t rsc, uint8_t erc, int solicited, ++ int chain, uint16_t rsid) ++{ ++ CRW crw; ++ ++ crw.flags = (rsc << 8) | erc; ++ if (solicited) { ++ crw.flags |= CRW_FLAGS_MASK_S; ++ } ++ if (chain) { ++ crw.flags |= CRW_FLAGS_MASK_C; ++ } ++ crw.rsid = rsid; ++ if (channel_subsys.crws_lost) { ++ crw.flags |= CRW_FLAGS_MASK_R; ++ channel_subsys.crws_lost = false; ++ } ++ ++ css_crw_add_to_queue(crw); ++} ++ + void css_generate_sch_crws(uint8_t cssid, uint8_t ssid, uint16_t schid, + int hotplugged, int add) + { +diff --git a/include/hw/s390x/css.h b/include/hw/s390x/css.h +index 7e3a5e7433..08c869ab0a 100644 +--- a/include/hw/s390x/css.h ++++ b/include/hw/s390x/css.h +@@ -205,6 +205,7 @@ void copy_scsw_to_guest(SCSW *dest, const SCSW *src); + void css_inject_io_interrupt(SubchDev *sch); + void css_reset(void); + void css_reset_sch(SubchDev *sch); ++void css_crw_add_to_queue(CRW crw); + void css_queue_crw(uint8_t rsc, uint8_t erc, int solicited, + int chain, uint16_t rsid); + void css_generate_sch_crws(uint8_t cssid, uint8_t ssid, uint16_t schid, +-- +2.27.0 + diff --git a/kvm-s390x-fix-build-for-without-default-devices.patch b/kvm-s390x-fix-build-for-without-default-devices.patch new file mode 100755 index 0000000..6567c04 --- /dev/null +++ b/kvm-s390x-fix-build-for-without-default-devices.patch @@ -0,0 +1,74 @@ +From 
d86158eeb752242791e3f94172ed020204040250 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:46 -0500 +Subject: [PATCH 7/7] s390x: fix build for --without-default-devices +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-8-cohuck@redhat.com> +Patchwork-id: 100681 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 7/7] s390x: fix build for --without-default-devices +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +s390-pci-vfio.c calls into the vfio code, so we need it to be +built conditionally on vfio (which implies CONFIG_LINUX). + +Fixes: cd7498d07fbb ("s390x/pci: Add routine to get the vfio dma available count") +Reported-by: Philippe Mathieu-Daudé +Tested-by: Philippe Mathieu-Daudé +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Matthew Rosato +Message-Id: <20201103123237.718242-1-cohuck@redhat.com> +Acked-by: Greg Kurz +Tested-by: Greg Kurz +Signed-off-by: Cornelia Huck +(cherry picked from commit 77280d33bc9cfdbfb5b5d462259d644f5aefe9b3) +Signed-off-by: Cornelia Huck + + Conflicts: + hw/s390x/meson.build + include/hw/s390x/s390-pci-vfio.h + --> adaptions due to missing Meson rework + +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/Makefile.objs | 2 +- + include/hw/s390x/s390-pci-vfio.h | 3 ++- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs +index 43756c9437d..dbef4b8906c 100644 +--- a/hw/s390x/Makefile.objs ++++ b/hw/s390x/Makefile.objs +@@ -7,7 +7,7 @@ obj-y += ipl.o + obj-y += css.o + obj-$(CONFIG_S390_CCW_VIRTIO) += s390-virtio-ccw.o + obj-$(CONFIG_TERMINAL3270) += 3270-ccw.o +-obj-$(CONFIG_LINUX) += s390-pci-vfio.o ++obj-$(CONFIG_VFIO) += s390-pci-vfio.o + ifeq ($(CONFIG_VIRTIO_CCW),y) + obj-y += virtio-ccw.o + obj-$(CONFIG_VIRTIO_SERIAL) += virtio-ccw-serial.o +diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h +index 539bcf04eb5..685b136d46b 100644 +--- a/include/hw/s390x/s390-pci-vfio.h ++++ b/include/hw/s390x/s390-pci-vfio.h +@@ -13,8 +13,9 @@ + #define HW_S390_PCI_VFIO_H + + #include "hw/s390x/s390-pci-bus.h" ++#include "config-devices.h" + +-#ifdef CONFIG_LINUX ++#ifdef CONFIG_VFIO + bool s390_pci_update_dma_avail(int fd, unsigned int *avail); + S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, + S390PCIBusDevice *pbdev); +-- +2.27.0 + diff --git a/kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch b/kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch new file mode 100755 index 0000000..8b9294e --- /dev/null +++ b/kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch @@ -0,0 +1,82 @@ +From 536b6081c0739bebbb33583370f62116d0cb42da Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:01 -0400 +Subject: [PATCH 19/42] s390x: ipl: Consolidate iplb validity check into one + function +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-20-thuth@redhat.com> +Patchwork-id: 97038 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 19/38] s390x: ipl: Consolidate iplb validity check into one function +Bugzilla: 1828317 
+RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +It's nicer to just call one function than calling a function for each +possible iplb type. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20200310090950.61172-1-frankja@linux.ibm.com> +Reviewed-by: Christian Borntraeger +Signed-off-by: Christian Borntraeger +(cherry picked from commit 94c21436e5a89143f8b9cb4d089d1a2f3f4fd377) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.h | 18 +++++++++--------- + target/s390x/diag.c | 2 +- + 2 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h +index d4813105db..3e44abe1c6 100644 +--- a/hw/s390x/ipl.h ++++ b/hw/s390x/ipl.h +@@ -173,16 +173,16 @@ static inline bool iplb_valid_len(IplParameterBlock *iplb) + return be32_to_cpu(iplb->len) <= sizeof(IplParameterBlock); + } + +-static inline bool iplb_valid_ccw(IplParameterBlock *iplb) ++static inline bool iplb_valid(IplParameterBlock *iplb) + { +- return be32_to_cpu(iplb->len) >= S390_IPLB_MIN_CCW_LEN && +- iplb->pbt == S390_IPL_TYPE_CCW; +-} +- +-static inline bool iplb_valid_fcp(IplParameterBlock *iplb) +-{ +- return be32_to_cpu(iplb->len) >= S390_IPLB_MIN_FCP_LEN && +- iplb->pbt == S390_IPL_TYPE_FCP; ++ switch (iplb->pbt) { ++ case S390_IPL_TYPE_FCP: ++ return be32_to_cpu(iplb->len) >= S390_IPLB_MIN_FCP_LEN; ++ case S390_IPL_TYPE_CCW: ++ return be32_to_cpu(iplb->len) >= S390_IPLB_MIN_CCW_LEN; ++ default: ++ return false; ++ } + } + + #endif +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index b5aec06d6b..54e5670b3f 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -117,7 +117,7 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + + cpu_physical_memory_read(addr, iplb, be32_to_cpu(iplb->len)); + +- if (!iplb_valid_ccw(iplb) && !iplb_valid_fcp(iplb)) { ++ if (!iplb_valid(iplb)) { + 
env->regs[r1 + 1] = DIAG_308_RC_INVALID; + goto out; + } +-- +2.27.0 + diff --git a/kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch b/kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch new file mode 100755 index 0000000..9882324 --- /dev/null +++ b/kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch @@ -0,0 +1,83 @@ +From 999cf62d870ff9aa8e9609fcbbcefef9ae1aceb6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:50 -0400 +Subject: [PATCH 08/42] s390x: kvm: Make kvm_sclp_service_call void +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-9-thuth@redhat.com> +Patchwork-id: 97030 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 08/38] s390x: kvm: Make kvm_sclp_service_call void +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +It defaults to returning 0 anyway and that return value is not +necessary, as 0 is also the default rc that the caller would return. + +While doing that we can simplify the logic a bit and return early if +we inject a PGM exception. + +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Message-Id: <20191129091713.4582-1-frankja@linux.ibm.com> +Reviewed-by: David Hildenbrand +Signed-off-by: Cornelia Huck +(cherry picked from commit 15b6c0370c3e2774fd9ffda5c10c6e36952e8eb6) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/s390x/kvm.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index a02d569537..1c5bc7a2f9 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -1159,13 +1159,13 @@ void kvm_s390_access_exception(S390CPU *cpu, uint16_t code, uint64_t te_code) + kvm_s390_vcpu_interrupt(cpu, &irq); + } + +-static int kvm_sclp_service_call(S390CPU *cpu, struct kvm_run *run, ++static void kvm_sclp_service_call(S390CPU *cpu, struct kvm_run *run, + uint16_t ipbh0) + { + CPUS390XState *env = &cpu->env; + uint64_t sccb; + uint32_t code; +- int r = 0; ++ int r; + + sccb = env->regs[ipbh0 & 0xf]; + code = env->regs[(ipbh0 & 0xf0) >> 4]; +@@ -1173,11 +1173,9 @@ static int kvm_sclp_service_call(S390CPU *cpu, struct kvm_run *run, + r = sclp_service_call(env, sccb, code); + if (r < 0) { + kvm_s390_program_interrupt(cpu, -r); +- } else { +- setcc(cpu, r); ++ return; + } +- +- return 0; ++ setcc(cpu, r); + } + + static int handle_b2(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1) +@@ -1240,7 +1238,7 @@ static int handle_b2(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1) + setcc(cpu, 3); + break; + case PRIV_B2_SCLP_CALL: +- rc = kvm_sclp_service_call(cpu, run, ipbh0); ++ kvm_sclp_service_call(cpu, run, ipbh0); + break; + default: + rc = -1; +-- +2.27.0 + diff --git a/kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch b/kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch new file mode 100755 index 0000000..5e48efb --- /dev/null +++ b/kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch @@ -0,0 +1,150 @@ +From 3927f54a56e29003b84e0e3726d3a0170681128b Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:44 -0500 +Subject: [PATCH 5/7] s390x/pci: Add routine to get the vfio dma available + count + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-6-cohuck@redhat.com> +Patchwork-id: 100679 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 
5/7] s390x/pci: Add routine to get the vfio dma available count +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +Create new files for separating out vfio-specific work for s390 +pci. Add the first such routine, which issues VFIO_IOMMU_GET_INFO +ioctl to collect the current dma available count. + +Signed-off-by: Matthew Rosato +Reviewed-by: Cornelia Huck +[aw: Fix non-Linux build with CONFIG_LINUX] +Signed-off-by: Alex Williamson +(cherry picked from commit cd7498d07fbb20fa04790ff7ee168a8a8d01cb30) +Signed-off-by: Cornelia Huck + + Conflicts: + hw/s390x/meson.build + --> added the file in hw/s390x/Makefile.objs instead, + since we do not use Meson yet + hw/s390x/s390-pci-vfio.c + --> NULL-initialize "info" to avoid a downstream-only + compiler warning + +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/Makefile.objs | 1 + + hw/s390x/s390-pci-vfio.c | 54 ++++++++++++++++++++++++++++++++ + include/hw/s390x/s390-pci-vfio.h | 24 ++++++++++++++ + 3 files changed, 79 insertions(+) + create mode 100644 hw/s390x/s390-pci-vfio.c + create mode 100644 include/hw/s390x/s390-pci-vfio.h + +diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs +index c4086ec3171..43756c9437d 100644 +--- a/hw/s390x/Makefile.objs ++++ b/hw/s390x/Makefile.objs +@@ -7,6 +7,7 @@ obj-y += ipl.o + obj-y += css.o + obj-$(CONFIG_S390_CCW_VIRTIO) += s390-virtio-ccw.o + obj-$(CONFIG_TERMINAL3270) += 3270-ccw.o ++obj-$(CONFIG_LINUX) += s390-pci-vfio.o + ifeq ($(CONFIG_VIRTIO_CCW),y) + obj-y += virtio-ccw.o + obj-$(CONFIG_VIRTIO_SERIAL) += virtio-ccw-serial.o +diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c +new file mode 100644 +index 00000000000..0eb22ffec4c +--- /dev/null ++++ b/hw/s390x/s390-pci-vfio.c +@@ -0,0 +1,54 @@ ++/* ++ * s390 vfio-pci interfaces ++ * ++ * Copyright 2020 IBM Corp. 
++ * Author(s): Matthew Rosato ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. ++ */ ++ ++#include ++ ++#include "qemu/osdep.h" ++#include "hw/s390x/s390-pci-vfio.h" ++#include "hw/vfio/vfio-common.h" ++ ++/* ++ * Get the current DMA available count from vfio. Returns true if vfio is ++ * limiting DMA requests, false otherwise. The current available count read ++ * from vfio is returned in avail. ++ */ ++bool s390_pci_update_dma_avail(int fd, unsigned int *avail) ++{ ++ g_autofree struct vfio_iommu_type1_info *info = NULL; ++ uint32_t argsz; ++ ++ assert(avail); ++ ++ argsz = sizeof(struct vfio_iommu_type1_info); ++ info = g_malloc0(argsz); ++ ++ /* ++ * If the specified argsz is not large enough to contain all capabilities ++ * it will be updated upon return from the ioctl. Retry until we have ++ * a big enough buffer to hold the entire capability chain. ++ */ ++retry: ++ info->argsz = argsz; ++ ++ if (ioctl(fd, VFIO_IOMMU_GET_INFO, info)) { ++ return false; ++ } ++ ++ if (info->argsz > argsz) { ++ argsz = info->argsz; ++ info = g_realloc(info, argsz); ++ goto retry; ++ } ++ ++ /* If the capability exists, update with the current value */ ++ return vfio_get_info_dma_avail(info, avail); ++} ++ +diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h +new file mode 100644 +index 00000000000..1727292e9b5 +--- /dev/null ++++ b/include/hw/s390x/s390-pci-vfio.h +@@ -0,0 +1,24 @@ ++/* ++ * s390 vfio-pci interfaces ++ * ++ * Copyright 2020 IBM Corp. ++ * Author(s): Matthew Rosato ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. 
++ */ ++ ++#ifndef HW_S390_PCI_VFIO_H ++#define HW_S390_PCI_VFIO_H ++ ++#ifdef CONFIG_LINUX ++bool s390_pci_update_dma_avail(int fd, unsigned int *avail); ++#else ++static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) ++{ ++ return false; ++} ++#endif ++ ++#endif +-- +2.27.0 + diff --git a/kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch b/kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch new file mode 100755 index 0000000..13fd6b7 --- /dev/null +++ b/kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch @@ -0,0 +1,357 @@ +From 7ef9b9c593da98ad32ad20c28d17bb2700a35c29 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:45 -0500 +Subject: [PATCH 6/7] s390x/pci: Honor DMA limits set by vfio + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-7-cohuck@redhat.com> +Patchwork-id: 100680 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 6/7] s390x/pci: Honor DMA limits set by vfio +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +When an s390 guest is using lazy unmapping, it can result in a very +large number of outstanding DMA requests, far beyond the default +limit configured for vfio. Let's track DMA usage similar to vfio +in the host, and trigger the guest to flush their DMA mappings +before vfio runs out. + +Signed-off-by: Matthew Rosato +Reviewed-by: Cornelia Huck +[aw: non-Linux build fixes] +Signed-off-by: Alex Williamson +(cherry picked from commit 37fa32de707340f3a93959ad5a1ebc41ba1520ee) +Signed-off-by: Cornelia Huck + + Conflicts: + hw/s390x/s390-pci-bus.c + --> adapt to missing 981c3dcd9489 ("qdev: Convert to + qdev_unrealize() with Coccinelle") + hw/s390x/s390-pci-inst.c + --> adapt to out of order inclusion of 5039caf3c449 ("memory: + Add IOMMUTLBEvent") + include/hw/s390x/s390-pci-bus.h + --> adapt to missing db1015e92e04 ("Move QOM typedefs and + add missing includes") + +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/s390-pci-bus.c | 16 ++++++++---- + hw/s390x/s390-pci-inst.c | 45 +++++++++++++++++++++++++++----- + hw/s390x/s390-pci-vfio.c | 42 +++++++++++++++++++++++++++++ + include/hw/s390x/s390-pci-bus.h | 9 +++++++ + include/hw/s390x/s390-pci-inst.h | 3 +++ + include/hw/s390x/s390-pci-vfio.h | 12 +++++++++ + 6 files changed, 116 insertions(+), 11 deletions(-) + +diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c +index 6daef2b6d57..a9f6f550472 100644 +--- a/hw/s390x/s390-pci-bus.c ++++ b/hw/s390x/s390-pci-bus.c +@@ -17,6 +17,7 @@ + #include "cpu.h" + #include "hw/s390x/s390-pci-bus.h" + #include "hw/s390x/s390-pci-inst.h" ++#include "hw/s390x/s390-pci-vfio.h" + #include "hw/pci/pci_bus.h" + #include "hw/qdev-properties.h" + #include "hw/pci/pci_bridge.h" +@@ -771,6 +772,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) + s->bus_no = 0; + QTAILQ_INIT(&s->pending_sei); + QTAILQ_INIT(&s->zpci_devs); ++ QTAILQ_INIT(&s->zpci_dma_limit); + + css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false, + S390_ADAPTER_SUPPRESSIBLE, &local_err); +@@ -951,17 +953,18 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + } + } + ++ pbdev->pdev = pdev; ++ pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); ++ pbdev->iommu->pbdev = pbdev; ++ pbdev->state = ZPCI_FS_DISABLED; ++ + if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { + pbdev->fh |= FH_SHM_VFIO; ++ pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); + } else { + pbdev->fh |= FH_SHM_EMUL; + } + +- pbdev->pdev = pdev; +- pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); +- pbdev->iommu->pbdev = pbdev; +- pbdev->state = ZPCI_FS_DISABLED; +- + if (s390_pci_msix_init(pbdev)) { + error_setg(errp, "MSI-X support is mandatory " + "in the S390 architecture"); +@@ -1014,6 +1017,9 @@ static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, + pbdev->fid = 0; + QTAILQ_REMOVE(&s->zpci_devs, 
pbdev, link); + g_hash_table_remove(s->zpci_table, &pbdev->idx); ++ if (pbdev->iommu->dma_limit) { ++ s390_pci_end_dma_count(s, pbdev->iommu->dma_limit); ++ } + object_property_set_bool(OBJECT(dev), false, "realized", NULL); + } + } +diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c +index b1885344f18..edbdf727984 100644 +--- a/hw/s390x/s390-pci-inst.c ++++ b/hw/s390x/s390-pci-inst.c +@@ -32,6 +32,20 @@ + } \ + } while (0) + ++static inline void inc_dma_avail(S390PCIIOMMU *iommu) ++{ ++ if (iommu->dma_limit) { ++ iommu->dma_limit->avail++; ++ } ++} ++ ++static inline void dec_dma_avail(S390PCIIOMMU *iommu) ++{ ++ if (iommu->dma_limit) { ++ iommu->dma_limit->avail--; ++ } ++} ++ + static void s390_set_status_code(CPUS390XState *env, + uint8_t r, uint64_t status_code) + { +@@ -572,7 +586,8 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + return 0; + } + +-static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) ++static uint32_t s390_pci_update_iotlb(S390PCIIOMMU *iommu, ++ S390IOTLBEntry *entry) + { + S390IOTLBEntry *cache = g_hash_table_lookup(iommu->iotlb, &entry->iova); + IOMMUTLBEvent event = { +@@ -588,14 +603,15 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + + if (event.type == IOMMU_NOTIFIER_UNMAP) { + if (!cache) { +- return; ++ goto out; + } + g_hash_table_remove(iommu->iotlb, &entry->iova); ++ inc_dma_avail(iommu); + } else { + if (cache) { + if (cache->perm == entry->perm && + cache->translated_addr == entry->translated_addr) { +- return; ++ goto out; + } + + event.type = IOMMU_NOTIFIER_UNMAP; +@@ -611,9 +627,13 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + cache->len = PAGE_SIZE; + cache->perm = entry->perm; + g_hash_table_replace(iommu->iotlb, &cache->iova, cache); ++ dec_dma_avail(iommu); + } + + memory_region_notify_iommu(&iommu->iommu_mr, 0, event); ++ ++out: ++ return iommu->dma_limit ? 
iommu->dma_limit->avail : 1; + } + + int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) +@@ -625,6 +645,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + S390PCIIOMMU *iommu; + S390IOTLBEntry entry; + hwaddr start, end; ++ uint32_t dma_avail; + + if (env->psw.mask & PSW_MASK_PSTATE) { + s390_program_interrupt(env, PGM_PRIVILEGED, ra); +@@ -663,6 +684,11 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + } + + iommu = pbdev->iommu; ++ if (iommu->dma_limit) { ++ dma_avail = iommu->dma_limit->avail; ++ } else { ++ dma_avail = 1; ++ } + if (!iommu->g_iota) { + error = ERR_EVENT_INVALAS; + goto err; +@@ -680,8 +706,9 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + } + + start += entry.len; +- while (entry.iova < start && entry.iova < end) { +- s390_pci_update_iotlb(iommu, &entry); ++ while (entry.iova < start && entry.iova < end && ++ (dma_avail > 0 || entry.perm == IOMMU_NONE)) { ++ dma_avail = s390_pci_update_iotlb(iommu, &entry); + entry.iova += PAGE_SIZE; + entry.translated_addr += PAGE_SIZE; + } +@@ -694,7 +721,13 @@ err: + s390_pci_generate_error_event(error, pbdev->fh, pbdev->fid, start, 0); + } else { + pbdev->fmb.counter[ZPCI_FMB_CNT_RPCIT]++; +- setcc(cpu, ZPCI_PCI_LS_OK); ++ if (dma_avail > 0) { ++ setcc(cpu, ZPCI_PCI_LS_OK); ++ } else { ++ /* vfio DMA mappings are exhausted, trigger a RPCIT */ ++ setcc(cpu, ZPCI_PCI_LS_ERR); ++ s390_set_status_code(env, r1, ZPCI_RPCIT_ST_INSUFF_RES); ++ } + } + return 0; + } +diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c +index 0eb22ffec4c..01c1e8ac89a 100644 +--- a/hw/s390x/s390-pci-vfio.c ++++ b/hw/s390x/s390-pci-vfio.c +@@ -12,7 +12,9 @@ + #include + + #include "qemu/osdep.h" ++#include "hw/s390x/s390-pci-bus.h" + #include "hw/s390x/s390-pci-vfio.h" ++#include "hw/vfio/pci.h" + #include "hw/vfio/vfio-common.h" + + /* +@@ -52,3 +54,43 @@ retry: + return vfio_get_info_dma_avail(info, 
avail); + } + ++S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, ++ S390PCIBusDevice *pbdev) ++{ ++ S390PCIDMACount *cnt; ++ uint32_t avail; ++ VFIOPCIDevice *vpdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev); ++ int id; ++ ++ assert(vpdev); ++ ++ id = vpdev->vbasedev.group->container->fd; ++ ++ if (!s390_pci_update_dma_avail(id, &avail)) { ++ return NULL; ++ } ++ ++ QTAILQ_FOREACH(cnt, &s->zpci_dma_limit, link) { ++ if (cnt->id == id) { ++ cnt->users++; ++ return cnt; ++ } ++ } ++ ++ cnt = g_new0(S390PCIDMACount, 1); ++ cnt->id = id; ++ cnt->users = 1; ++ cnt->avail = avail; ++ QTAILQ_INSERT_TAIL(&s->zpci_dma_limit, cnt, link); ++ return cnt; ++} ++ ++void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) ++{ ++ assert(cnt); ++ ++ cnt->users--; ++ if (cnt->users == 0) { ++ QTAILQ_REMOVE(&s->zpci_dma_limit, cnt, link); ++ } ++} +diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h +index 550f3cc5e92..2f2edbd0bf3 100644 +--- a/include/hw/s390x/s390-pci-bus.h ++++ b/include/hw/s390x/s390-pci-bus.h +@@ -266,6 +266,13 @@ typedef struct S390IOTLBEntry { + } S390IOTLBEntry; + + typedef struct S390PCIBusDevice S390PCIBusDevice; ++typedef struct S390PCIDMACount { ++ int id; ++ int users; ++ uint32_t avail; ++ QTAILQ_ENTRY(S390PCIDMACount) link; ++} S390PCIDMACount; ++ + typedef struct S390PCIIOMMU { + Object parent_obj; + S390PCIBusDevice *pbdev; +@@ -277,6 +284,7 @@ typedef struct S390PCIIOMMU { + uint64_t pba; + uint64_t pal; + GHashTable *iotlb; ++ S390PCIDMACount *dma_limit; + } S390PCIIOMMU; + + typedef struct S390PCIIOMMUTable { +@@ -352,6 +360,7 @@ typedef struct S390pciState { + GHashTable *zpci_table; + QTAILQ_HEAD(, SeiContainer) pending_sei; + QTAILQ_HEAD(, S390PCIBusDevice) zpci_devs; ++ QTAILQ_HEAD(, S390PCIDMACount) zpci_dma_limit; + } S390pciState; + + S390pciState *s390_get_phb(void); +diff --git a/include/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h +index fa3bf8b5aad..8ee3a3c2375 100644 
+--- a/include/hw/s390x/s390-pci-inst.h ++++ b/include/hw/s390x/s390-pci-inst.h +@@ -254,6 +254,9 @@ typedef struct ClpReqRspQueryPciGrp { + #define ZPCI_STPCIFC_ST_INVAL_DMAAS 28 + #define ZPCI_STPCIFC_ST_ERROR_RECOVER 40 + ++/* Refresh PCI Translations status codes */ ++#define ZPCI_RPCIT_ST_INSUFF_RES 16 ++ + /* FIB function controls */ + #define ZPCI_FIB_FC_ENABLED 0x80 + #define ZPCI_FIB_FC_ERROR 0x40 +diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h +index 1727292e9b5..539bcf04eb5 100644 +--- a/include/hw/s390x/s390-pci-vfio.h ++++ b/include/hw/s390x/s390-pci-vfio.h +@@ -12,13 +12,25 @@ + #ifndef HW_S390_PCI_VFIO_H + #define HW_S390_PCI_VFIO_H + ++#include "hw/s390x/s390-pci-bus.h" ++ + #ifdef CONFIG_LINUX + bool s390_pci_update_dma_avail(int fd, unsigned int *avail); ++S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, ++ S390PCIBusDevice *pbdev); ++void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt); + #else + static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) + { + return false; + } ++static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, ++ S390PCIBusDevice *pbdev) ++{ ++ return NULL; ++} ++static inline void s390_pci_end_dma_count(S390pciState *s, ++ S390PCIDMACount *cnt) { } + #endif + + #endif +-- +2.27.0 + diff --git a/kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch b/kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch new file mode 100755 index 0000000..27e5fa2 --- /dev/null +++ b/kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch @@ -0,0 +1,110 @@ +From 73fb2438518ef2073f2486fcf1dd8cddffb29228 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:41 -0500 +Subject: [PATCH 2/7] s390x/pci: Move header files to include/hw/s390x + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-3-cohuck@redhat.com> +Patchwork-id: 100676 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/7] s390x/pci: Move header files to 
include/hw/s390x +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +Seems a more appropriate location for them. + +Signed-off-by: Matthew Rosato +Reviewed-by: Cornelia Huck +Signed-off-by: Alex Williamson +(cherry picked from commit 408b55db8be3e3edae041d46ef8786fabc1476aa) +Signed-off-by: Cornelia Huck + + Conflicts: + hw/s390x/s390-virtio-ccw.c + --> context diff + +Signed-off-by: Danilo C. L. de Paula +--- + MAINTAINERS | 1 + + hw/s390x/s390-pci-bus.c | 4 ++-- + hw/s390x/s390-pci-inst.c | 4 ++-- + hw/s390x/s390-virtio-ccw.c | 2 +- + {hw => include/hw}/s390x/s390-pci-bus.h | 0 + {hw => include/hw}/s390x/s390-pci-inst.h | 0 + 6 files changed, 6 insertions(+), 5 deletions(-) + rename {hw => include/hw}/s390x/s390-pci-bus.h (100%) + rename {hw => include/hw}/s390x/s390-pci-inst.h (100%) + +diff --git a/MAINTAINERS b/MAINTAINERS +index 2742c955754..56ca8193d86 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -1225,6 +1225,7 @@ S390 PCI + M: Matthew Rosato + S: Supported + F: hw/s390x/s390-pci* ++F: include/hw/s390x/s390-pci* + L: qemu-s390x@nongnu.org + + UniCore32 Machines +diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c +index 2d2f4a7c419..6daef2b6d57 100644 +--- a/hw/s390x/s390-pci-bus.c ++++ b/hw/s390x/s390-pci-bus.c +@@ -15,8 +15,8 @@ + #include "qapi/error.h" + #include "qapi/visitor.h" + #include "cpu.h" +-#include "s390-pci-bus.h" +-#include "s390-pci-inst.h" ++#include "hw/s390x/s390-pci-bus.h" ++#include "hw/s390x/s390-pci-inst.h" + #include "hw/pci/pci_bus.h" + #include "hw/qdev-properties.h" + #include "hw/pci/pci_bridge.h" +diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c +index 27b189e6d75..b1885344f18 100644 +--- a/hw/s390x/s390-pci-inst.c ++++ b/hw/s390x/s390-pci-inst.c +@@ -13,12 +13,12 @@ + + #include "qemu/osdep.h" + #include "cpu.h" +-#include "s390-pci-inst.h" +-#include "s390-pci-bus.h" + #include "exec/memop.h" + #include 
"exec/memory-internal.h" + #include "qemu/error-report.h" + #include "sysemu/hw_accel.h" ++#include "hw/s390x/s390-pci-inst.h" ++#include "hw/s390x/s390-pci-bus.h" + #include "hw/s390x/tod.h" + + #ifndef DEBUG_S390PCI_INST +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 5b3d07f55c4..101f3b7c6e1 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -27,7 +27,7 @@ + #include "qemu/ctype.h" + #include "qemu/error-report.h" + #include "qemu/option.h" +-#include "s390-pci-bus.h" ++#include "hw/s390x/s390-pci-bus.h" + #include "sysemu/reset.h" + #include "hw/s390x/storage-keys.h" + #include "hw/s390x/storage-attributes.h" +diff --git a/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h +similarity index 100% +rename from hw/s390x/s390-pci-bus.h +rename to include/hw/s390x/s390-pci-bus.h +diff --git a/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h +similarity index 100% +rename from hw/s390x/s390-pci-inst.h +rename to include/hw/s390x/s390-pci-inst.h +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Add-migration-blocker.patch b/kvm-s390x-protvirt-Add-migration-blocker.patch new file mode 100755 index 0000000..056f8d5 --- /dev/null +++ b/kvm-s390x-protvirt-Add-migration-blocker.patch @@ -0,0 +1,79 @@ +From 0ba8d4ea1cc34230356cc446dfa8d1cb52cbd2f3 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:05 -0400 +Subject: [PATCH 23/42] s390x: protvirt: Add migration blocker + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-24-thuth@redhat.com> +Patchwork-id: 97043 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 23/38] s390x: protvirt: Add migration blocker +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Migration is not yet supported. 
+ +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-5-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 0141e1b47707d90f5bd9d252da064ebdaca698a6) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 82da1d9ab5..dbd5125232 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -44,6 +44,9 @@ + #include "sysemu/sysemu.h" + #include "hw/s390x/pv.h" + #include ++#include "migration/blocker.h" ++ ++static Error *pv_mig_blocker; + + S390CPU *s390_cpu_addr2state(uint16_t cpu_addr) + { +@@ -325,15 +328,30 @@ static void s390_machine_unprotect(S390CcwMachineState *ms) + { + s390_pv_vm_disable(); + ms->pv = false; ++ migrate_del_blocker(pv_mig_blocker); ++ error_free_or_abort(&pv_mig_blocker); + } + + static int s390_machine_protect(S390CcwMachineState *ms) + { ++ Error *local_err = NULL; + int rc; + ++ error_setg(&pv_mig_blocker, ++ "protected VMs are currently not migrateable."); ++ rc = migrate_add_blocker(pv_mig_blocker, &local_err); ++ if (rc) { ++ error_report_err(local_err); ++ error_free_or_abort(&pv_mig_blocker); ++ return rc; ++ } ++ + /* Create SE VM */ + rc = s390_pv_vm_enable(); + if (rc) { ++ error_report_err(local_err); ++ migrate_del_blocker(pv_mig_blocker); ++ error_free_or_abort(&pv_mig_blocker); + return rc; + } + +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch b/kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch new file mode 100755 index 0000000..0cf75b0 --- /dev/null +++ b/kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch @@ -0,0 +1,135 @@ +From 1cfcff169f392179258e4535e60d4ef9cabae3c6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 
29 May 2020 05:54:13 -0400 +Subject: [PATCH 31/42] s390x: protvirt: Disable address checks for PV guest IO + emulation + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-32-thuth@redhat.com> +Patchwork-id: 97044 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 31/38] s390x: protvirt: Disable address checks for PV guest IO emulation +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +IO instruction data is routed through SIDAD for protected guests, so +addresses do not need to be checked, as this is kernel memory which is +always available. + +Also the instruction data always starts at offset 0 of the SIDAD. + +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-13-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit c10b708752e5264a85b5c3afa0a0ccfcf6503ddf) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/ioinst.c | 35 ++++++++++++++++++++++++++++------- + 1 file changed, 28 insertions(+), 7 deletions(-) + +diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c +index c437a1d8c6..bbcccf6be2 100644 +--- a/target/s390x/ioinst.c ++++ b/target/s390x/ioinst.c +@@ -16,6 +16,25 @@ + #include "hw/s390x/ioinst.h" + #include "trace.h" + #include "hw/s390x/s390-pci-bus.h" ++#include "hw/s390x/pv.h" ++ ++/* All I/O instructions but chsc use the s format */ ++static uint64_t get_address_from_regs(CPUS390XState *env, uint32_t ipb, ++ uint8_t *ar) ++{ ++ /* ++ * Addresses for protected guests are all offsets into the ++ * satellite block which holds the IO control structures. Those ++ * control structures are always starting at offset 0 and are ++ * always aligned and accessible. So we can return 0 here which ++ * will pass the following address checks. 
++ */ ++ if (s390_is_pv()) { ++ *ar = 0; ++ return 0; ++ } ++ return decode_basedisp_s(env, ipb, ar); ++} + + int ioinst_disassemble_sch_ident(uint32_t value, int *m, int *cssid, int *ssid, + int *schid) +@@ -114,7 +133,7 @@ void ioinst_handle_msch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + CPUS390XState *env = &cpu->env; + uint8_t ar; + +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; +@@ -171,7 +190,7 @@ void ioinst_handle_ssch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + CPUS390XState *env = &cpu->env; + uint8_t ar; + +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; +@@ -203,7 +222,7 @@ void ioinst_handle_stcrw(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + CPUS390XState *env = &cpu->env; + uint8_t ar; + +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; +@@ -234,7 +253,7 @@ void ioinst_handle_stsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, + CPUS390XState *env = &cpu->env; + uint8_t ar; + +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; +@@ -303,7 +322,7 @@ int ioinst_handle_tsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + return -EIO; + } + trace_ioinst_sch_id("tsch", cssid, ssid, schid); +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return -EIO; +@@ -601,7 +620,7 @@ void ioinst_handle_chsc(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + { + ChscReq *req; + ChscResp *res; +- uint64_t addr; ++ uint64_t addr = 0; + 
int reg; + uint16_t len; + uint16_t command; +@@ -610,7 +629,9 @@ void ioinst_handle_chsc(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + + trace_ioinst("chsc"); + reg = (ipb >> 20) & 0x00f; +- addr = env->regs[reg]; ++ if (!s390_is_pv()) { ++ addr = env->regs[reg]; ++ } + /* Page boundary? */ + if (addr & 0xfff) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch b/kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch new file mode 100755 index 0000000..9857f28 --- /dev/null +++ b/kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch @@ -0,0 +1,55 @@ +From b54e5e6df5d5bbe4dc0a206be9f6b6d971ce6f43 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:17 -0400 +Subject: [PATCH 35/42] s390x: protvirt: Fix stray error_report_err in + s390_machine_protect +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-36-thuth@redhat.com> +Patchwork-id: 97042 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 35/38] s390x: protvirt: Fix stray error_report_err in s390_machine_protect +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +In case the protection of the machine fails at s390_pv_vm_enable(), +we'll currently report the local_error variable. Problem is that +there's no migration blocker error that we can report at this point so +the pointer is always NULL which leads to a SEGFAULT. + +Let's remove the error report. 
+ +Signed-off-by: Janosch Frank +Reported-by: Marc Hartmayer +Fixes: 0141e1b47707 ("s390x: protvirt: Add migration blocker") +Message-Id: <20200326140505.2432-1-frankja@linux.ibm.com> +Reviewed-by: David Hildenbrand +Signed-off-by: Cornelia Huck +(cherry picked from commit 7152c9ecc6530ea145c122b0a58cc28802f630c6) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index b4ebe83766..c08e42bda1 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -360,7 +360,6 @@ static int s390_machine_protect(S390CcwMachineState *ms) + rc = s390_pv_vm_enable(); + if (rc) { + qemu_balloon_inhibit(false); +- error_report_err(local_err); + migrate_del_blocker(pv_mig_blocker); + error_free_or_abort(&pv_mig_blocker); + return rc; +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch b/kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch new file mode 100755 index 0000000..4d6a44b --- /dev/null +++ b/kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch @@ -0,0 +1,61 @@ +From 680154545d1f9d75fb33615b1900661e7d09be4e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:15 -0400 +Subject: [PATCH 33/42] s390x: protvirt: Handle SIGP store status correctly + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-34-thuth@redhat.com> +Patchwork-id: 97054 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 33/38] s390x: protvirt: Handle SIGP store status correctly +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +For protected VMs status storing is not done by QEMU anymore. 
+ +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-15-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit f2a2d9a2bae8f6fdc5e9a40c1241e9428f15b4df) +[thuth: fixed contextual conflict due to missing commit 44eaccd091a7365fd37] +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/helper.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/target/s390x/helper.c b/target/s390x/helper.c +index 6808dfda01..36b6d3d9d1 100644 +--- a/target/s390x/helper.c ++++ b/target/s390x/helper.c +@@ -25,6 +25,7 @@ + #include "qemu/timer.h" + #include "qemu/qemu-print.h" + #include "hw/s390x/ioinst.h" ++#include "hw/s390x/pv.h" + #include "sysemu/hw_accel.h" + #include "sysemu/runstate.h" + #ifndef CONFIG_USER_ONLY +@@ -246,6 +247,11 @@ int s390_store_status(S390CPU *cpu, hwaddr addr, bool store_arch) + hwaddr len = sizeof(*sa); + int i; + ++ /* For PVMs storing will occur when this cpu enters SIE again */ ++ if (s390_is_pv()) { ++ return 0; ++ } ++ + sa = cpu_physical_memory_map(addr, &len, 1); + if (!sa) { + return -EFAULT; +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch b/kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch new file mode 100755 index 0000000..a843d03 --- /dev/null +++ b/kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch @@ -0,0 +1,104 @@ +From 095553f9dd1fec02869bf974e8cc07614d6587e5 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:06 -0400 +Subject: [PATCH 24/42] s390x: protvirt: Inhibit balloon when switching to + protected mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-25-thuth@redhat.com> +Patchwork-id: 97036 +O-Subject: 
[RHEL-8.3.0 qemu-kvm PATCH v2 24/38] s390x: protvirt: Inhibit balloon when switching to protected mode +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Ballooning in protected VMs can only be done when the guest shares the +pages it gives to the host. If pages are not shared, the integrity +checks will fail once those pages have been altered and are given back +to the guest. + +As we currently do not yet have a solution for this we will continue +like this: + +1. We block ballooning now in QEMU (with this patch). + +2. Later we will provide a change to virtio that removes the blocker +and adds VIRTIO_F_IOMMU_PLATFORM automatically by QEMU when doing the +protvirt switch. This is OK, as the balloon driver in Linux (the only +supported guest) will refuse to work with the IOMMU_PLATFORM feature +bit set. + +3. Later, we can fix the guest balloon driver to accept the IOMMU +feature bit and correctly exercise sharing and unsharing of balloon +pages. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-6-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit b1697f63fd8f8201b1447bb55f595830b9cbde31) +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/s390-virtio-ccw.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index dbd5125232..b4ebe83766 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -42,6 +42,7 @@ + #include "hw/qdev-properties.h" + #include "hw/s390x/tod.h" + #include "sysemu/sysemu.h" ++#include "sysemu/balloon.h" + #include "hw/s390x/pv.h" + #include + #include "migration/blocker.h" +@@ -330,6 +331,7 @@ static void s390_machine_unprotect(S390CcwMachineState *ms) + ms->pv = false; + migrate_del_blocker(pv_mig_blocker); + error_free_or_abort(&pv_mig_blocker); ++ qemu_balloon_inhibit(false); + } + + static int s390_machine_protect(S390CcwMachineState *ms) +@@ -337,10 +339,18 @@ static int s390_machine_protect(S390CcwMachineState *ms) + Error *local_err = NULL; + int rc; + ++ /* ++ * Ballooning on protected VMs needs support in the guest for ++ * sharing and unsharing balloon pages. Block ballooning for ++ * now, until we have a solution to make at least Linux guests ++ * either support it or fail gracefully. 
++ */ ++ qemu_balloon_inhibit(true); + error_setg(&pv_mig_blocker, + "protected VMs are currently not migrateable."); + rc = migrate_add_blocker(pv_mig_blocker, &local_err); + if (rc) { ++ qemu_balloon_inhibit(false); + error_report_err(local_err); + error_free_or_abort(&pv_mig_blocker); + return rc; +@@ -349,6 +359,7 @@ static int s390_machine_protect(S390CcwMachineState *ms) + /* Create SE VM */ + rc = s390_pv_vm_enable(); + if (rc) { ++ qemu_balloon_inhibit(false); + error_report_err(local_err); + migrate_del_blocker(pv_mig_blocker); + error_free_or_abort(&pv_mig_blocker); +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-KVM-intercept-changes.patch b/kvm-s390x-protvirt-KVM-intercept-changes.patch new file mode 100755 index 0000000..2ac3d03 --- /dev/null +++ b/kvm-s390x-protvirt-KVM-intercept-changes.patch @@ -0,0 +1,75 @@ +From 10ed4f6ad687d98f0bfe06d75775e8c541da80a0 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:07 -0400 +Subject: [PATCH 25/42] s390x: protvirt: KVM intercept changes + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-26-thuth@redhat.com> +Patchwork-id: 97035 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 25/38] s390x: protvirt: KVM intercept changes +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Protected VMs no longer intercept with code 4 for an instruction +interception. Instead they have codes 104 and 108 for protected +instruction interception and protected instruction notification +respectively. + +The 104 mirrors the 4 interception. + +The 108 is a notification interception to let KVM and QEMU know that +something changed and we need to update tracking information or +perform specific tasks. 
It's currently taken for the following +instructions: + +* spx (To inform about the changed prefix location) +* sclp (On incorrect SCCB values, so we can inject an IRQ) +* sigp (All but "stop and store status") +* diag308 (Subcodes 0/1) + +Of these exits only sclp errors, state changing sigps and diag308 will +reach QEMU. QEMU will do its parts of the job, while the ultravisor +has done the instruction part of the job. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-7-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 2585e507ffa1da01b57dbea26b1e1fe507d27198) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/kvm.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 9a0be13959..af50b2c253 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -115,6 +115,8 @@ + #define ICPT_CPU_STOP 0x28 + #define ICPT_OPEREXC 0x2c + #define ICPT_IO 0x40 ++#define ICPT_PV_INSTR 0x68 ++#define ICPT_PV_INSTR_NOTIFICATION 0x6c + + #define NR_LOCAL_IRQS 32 + /* +@@ -1695,6 +1697,8 @@ static int handle_intercept(S390CPU *cpu) + (long)cs->kvm_run->psw_addr); + switch (icpt_code) { + case ICPT_INSTRUCTION: ++ case ICPT_PV_INSTR: ++ case ICPT_PV_INSTR_NOTIFICATION: + r = handle_instruction(cpu, run); + break; + case ICPT_PROGRAM: +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch b/kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch new file mode 100755 index 0000000..0609546 --- /dev/null +++ b/kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch @@ -0,0 +1,171 @@ +From 8345b90f43b14435938fbbe0f3a510a60f5d0ded Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:14 -0400 +Subject: [PATCH 32/42] s390x: protvirt: Move IO control structures over SIDA + +RH-Author: Thomas Huth 
+Message-id: <20200529055420.16855-33-thuth@redhat.com> +Patchwork-id: 97040 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 32/38] s390x: protvirt: Move IO control structures over SIDA +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +For protected guests, we need to put the IO emulation results into the +SIDA, so SIE will write them into the guest at the next entry. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-14-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit fcc10c1470d6e9460ebcf4c30f5bbd37b921a041) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/ioinst.c | 61 +++++++++++++++++++++++++++++++------------ + 1 file changed, 45 insertions(+), 16 deletions(-) + +diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c +index bbcccf6be2..f40c35c6ff 100644 +--- a/target/s390x/ioinst.c ++++ b/target/s390x/ioinst.c +@@ -138,7 +138,9 @@ void ioinst_handle_msch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; + } +- if (s390_cpu_virt_mem_read(cpu, addr, ar, &schib, sizeof(schib))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_read(cpu, addr, &schib, sizeof(schib)); ++ } else if (s390_cpu_virt_mem_read(cpu, addr, ar, &schib, sizeof(schib))) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } +@@ -195,7 +197,9 @@ void ioinst_handle_ssch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; + } +- if (s390_cpu_virt_mem_read(cpu, addr, ar, &orig_orb, sizeof(orb))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_read(cpu, addr, &orig_orb, sizeof(orb)); ++ } else if (s390_cpu_virt_mem_read(cpu, addr, ar, &orig_orb, sizeof(orb))) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } +@@ -231,14 +235,19 @@ void ioinst_handle_stcrw(S390CPU 
*cpu, uint32_t ipb, uintptr_t ra) + cc = css_do_stcrw(&crw); + /* 0 - crw stored, 1 - zeroes stored */ + +- if (s390_cpu_virt_mem_write(cpu, addr, ar, &crw, sizeof(crw)) == 0) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, addr, &crw, sizeof(crw)); + setcc(cpu, cc); + } else { +- if (cc == 0) { +- /* Write failed: requeue CRW since STCRW is suppressing */ +- css_undo_stcrw(&crw); ++ if (s390_cpu_virt_mem_write(cpu, addr, ar, &crw, sizeof(crw)) == 0) { ++ setcc(cpu, cc); ++ } else { ++ if (cc == 0) { ++ /* Write failed: requeue CRW since STCRW is suppressing */ ++ css_undo_stcrw(&crw); ++ } ++ s390_cpu_virt_mem_handle_exc(cpu, ra); + } +- s390_cpu_virt_mem_handle_exc(cpu, ra); + } + } + +@@ -260,6 +269,13 @@ void ioinst_handle_stsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, + } + + if (ioinst_disassemble_sch_ident(reg1, &m, &cssid, &ssid, &schid)) { ++ /* ++ * The Ultravisor checks schid bit 16 to be one and bits 0-12 ++ * to be 0 and injects a operand exception itself. ++ * ++ * Hence we should never end up here. 
++ */ ++ g_assert(!s390_is_pv()); + /* + * As operand exceptions have a lower priority than access exceptions, + * we check whether the memory area is writeable (injecting the +@@ -292,14 +308,17 @@ void ioinst_handle_stsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, + } + } + if (cc != 3) { +- if (s390_cpu_virt_mem_write(cpu, addr, ar, &schib, +- sizeof(schib)) != 0) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, addr, &schib, sizeof(schib)); ++ } else if (s390_cpu_virt_mem_write(cpu, addr, ar, &schib, ++ sizeof(schib)) != 0) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } + } else { + /* Access exceptions have a higher priority than cc3 */ +- if (s390_cpu_virt_mem_check_write(cpu, addr, ar, sizeof(schib)) != 0) { ++ if (!s390_is_pv() && ++ s390_cpu_virt_mem_check_write(cpu, addr, ar, sizeof(schib)) != 0) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } +@@ -336,7 +355,9 @@ int ioinst_handle_tsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + } + /* 0 - status pending, 1 - not status pending, 3 - not operational */ + if (cc != 3) { +- if (s390_cpu_virt_mem_write(cpu, addr, ar, &irb, irb_len) != 0) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, addr, &irb, irb_len); ++ } else if (s390_cpu_virt_mem_write(cpu, addr, ar, &irb, irb_len) != 0) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return -EFAULT; + } +@@ -344,7 +365,8 @@ int ioinst_handle_tsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + } else { + irb_len = sizeof(irb) - sizeof(irb.emw); + /* Access exceptions have a higher priority than cc3 */ +- if (s390_cpu_virt_mem_check_write(cpu, addr, ar, irb_len) != 0) { ++ if (!s390_is_pv() && ++ s390_cpu_virt_mem_check_write(cpu, addr, ar, irb_len) != 0) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return -EFAULT; + } +@@ -642,7 +664,9 @@ void ioinst_handle_chsc(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + * present CHSC sub-handlers ... if we ever need more, we should take + * care of req->len here first. 
+ */ +- if (s390_cpu_virt_mem_read(cpu, addr, reg, buf, sizeof(ChscReq))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_read(cpu, addr, buf, sizeof(ChscReq)); ++ } else if (s390_cpu_virt_mem_read(cpu, addr, reg, buf, sizeof(ChscReq))) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } +@@ -675,11 +699,16 @@ void ioinst_handle_chsc(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + break; + } + +- if (!s390_cpu_virt_mem_write(cpu, addr + len, reg, res, +- be16_to_cpu(res->len))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, addr + len, res, be16_to_cpu(res->len)); + setcc(cpu, 0); /* Command execution complete */ + } else { +- s390_cpu_virt_mem_handle_exc(cpu, ra); ++ if (!s390_cpu_virt_mem_write(cpu, addr + len, reg, res, ++ be16_to_cpu(res->len))) { ++ setcc(cpu, 0); /* Command execution complete */ ++ } else { ++ s390_cpu_virt_mem_handle_exc(cpu, ra); ++ } + } + } + +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch b/kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch new file mode 100755 index 0000000..1d60070 --- /dev/null +++ b/kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch @@ -0,0 +1,70 @@ +From 27f5d8a3af2863e39b7c46a3128009988d772f15 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:09 -0400 +Subject: [PATCH 27/42] s390x: protvirt: Move STSI data over SIDAD + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-28-thuth@redhat.com> +Patchwork-id: 97046 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 27/38] s390x: protvirt: Move STSI data over SIDAD +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +For protected guests, we need to put the STSI emulation results into +the SIDA, so SIE will write them into the guest at the next entry. 
+ +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-9-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 7c713b8acb70fb61f9650f8a7702dec546752bb6) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/kvm.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index f67bb5ce2c..6809a5ac40 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -50,6 +50,7 @@ + #include "exec/memattrs.h" + #include "hw/s390x/s390-virtio-ccw.h" + #include "hw/s390x/s390-virtio-hcall.h" ++#include "hw/s390x/pv.h" + + #ifndef DEBUG_KVM + #define DEBUG_KVM 0 +@@ -1803,7 +1804,9 @@ static void insert_stsi_3_2_2(S390CPU *cpu, __u64 addr, uint8_t ar) + SysIB_322 sysib; + int del; + +- if (s390_cpu_virt_mem_read(cpu, addr, ar, &sysib, sizeof(sysib))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_read(cpu, 0, &sysib, sizeof(sysib)); ++ } else if (s390_cpu_virt_mem_read(cpu, addr, ar, &sysib, sizeof(sysib))) { + return; + } + /* Shift the stack of Extended Names to prepare for our own data */ +@@ -1843,7 +1846,11 @@ static void insert_stsi_3_2_2(S390CPU *cpu, __u64 addr, uint8_t ar) + /* Insert UUID */ + memcpy(sysib.vm[0].uuid, &qemu_uuid, sizeof(sysib.vm[0].uuid)); + +- s390_cpu_virt_mem_write(cpu, addr, ar, &sysib, sizeof(sysib)); ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, 0, &sysib, sizeof(sysib)); ++ } else { ++ s390_cpu_virt_mem_write(cpu, addr, ar, &sysib, sizeof(sysib)); ++ } + } + + static int handle_stsi(S390CPU *cpu) +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch b/kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch new file mode 100755 index 0000000..1b22719 --- /dev/null +++ b/kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch @@ -0,0 +1,93 @@ +From 33d4e21cfd236aecd9e4dbe8228d058fd1f22400 Mon Sep 17 00:00:00 2001 +From: 
Thomas Huth +Date: Fri, 29 May 2020 05:54:12 -0400 +Subject: [PATCH 30/42] s390x: protvirt: Move diag 308 data over SIDA + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-31-thuth@redhat.com> +Patchwork-id: 97048 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 30/38] s390x: protvirt: Move diag 308 data over SIDA +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +For protected guests the IPIB is written/read to/from the SIDA, so we +need those accesses to go through s390_cpu_pv_mem_read/write(). + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-12-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 9c61e11238cfa8f70e3eb90aac5d3e5646e5432f) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/diag.c | 25 ++++++++++++++++++++----- + 1 file changed, 20 insertions(+), 5 deletions(-) + +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index b2cbefb8cf..1a48429564 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -75,6 +75,7 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + { + bool valid; + CPUState *cs = env_cpu(env); ++ S390CPU *cpu = S390_CPU(cs); + uint64_t addr = env->regs[r1]; + uint64_t subcode = env->regs[r3]; + IplParameterBlock *iplb; +@@ -111,13 +112,22 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + return; + } + iplb = g_new0(IplParameterBlock, 1); +- cpu_physical_memory_read(addr, iplb, sizeof(iplb->len)); ++ if (!s390_is_pv()) { ++ cpu_physical_memory_read(addr, iplb, sizeof(iplb->len)); ++ } else { ++ s390_cpu_pv_mem_read(cpu, 0, iplb, sizeof(iplb->len)); ++ } ++ + if (!iplb_valid_len(iplb)) { + env->regs[r1 + 1] = DIAG_308_RC_INVALID; + goto out; + } + +- cpu_physical_memory_read(addr, iplb, 
be32_to_cpu(iplb->len)); ++ if (!s390_is_pv()) { ++ cpu_physical_memory_read(addr, iplb, be32_to_cpu(iplb->len)); ++ } else { ++ s390_cpu_pv_mem_read(cpu, 0, iplb, be32_to_cpu(iplb->len)); ++ } + + valid = subcode == DIAG308_PV_SET ? iplb_valid_pv(iplb) : iplb_valid(iplb); + if (!valid) { +@@ -140,12 +150,17 @@ out: + } else { + iplb = s390_ipl_get_iplb(); + } +- if (iplb) { ++ if (!iplb) { ++ env->regs[r1 + 1] = DIAG_308_RC_NO_CONF; ++ return; ++ } ++ ++ if (!s390_is_pv()) { + cpu_physical_memory_write(addr, iplb, be32_to_cpu(iplb->len)); +- env->regs[r1 + 1] = DIAG_308_RC_OK; + } else { +- env->regs[r1 + 1] = DIAG_308_RC_NO_CONF; ++ s390_cpu_pv_mem_write(cpu, 0, iplb, be32_to_cpu(iplb->len)); + } ++ env->regs[r1 + 1] = DIAG_308_RC_OK; + return; + case DIAG308_PV_START: + iplb = s390_ipl_get_iplb_pv(); +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-SCLP-interpretation.patch b/kvm-s390x-protvirt-SCLP-interpretation.patch new file mode 100755 index 0000000..10f1930 --- /dev/null +++ b/kvm-s390x-protvirt-SCLP-interpretation.patch @@ -0,0 +1,172 @@ +From 5a8b40c3fdafeb49072f8643210bea00ce1478c4 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:10 -0400 +Subject: [PATCH 28/42] s390x: protvirt: SCLP interpretation + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-29-thuth@redhat.com> +Patchwork-id: 97053 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 28/38] s390x: protvirt: SCLP interpretation +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +SCLP for a protected guest is done over the SIDAD, so we need to use +the s390_cpu_pv_mem_* functions to access the SIDAD instead of guest +memory when reading/writing SCBs. + +To not confuse the sclp emulation, we set 0x4000 as the SCCB address, +since the function that injects the sclp external interrupt would +reject a zero sccb address. 
+ +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Reviewed-by: Christian Borntraeger +Message-Id: <20200319131921.2367-10-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 0f73c5b30b8ba6c0828608be496d2f59a5427539) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 56 +++++++++++++++++++++++++++++++++-------- + include/hw/s390x/sclp.h | 2 ++ + target/s390x/kvm.c | 25 ++++++++++++++---- + 3 files changed, 67 insertions(+), 16 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index f57ce7b739..1c380a49cc 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -33,6 +33,22 @@ static inline SCLPDevice *get_sclp_device(void) + return sclp; + } + ++static inline bool sclp_command_code_valid(uint32_t code) ++{ ++ switch (code & SCLP_CMD_CODE_MASK) { ++ case SCLP_CMDW_READ_SCP_INFO: ++ case SCLP_CMDW_READ_SCP_INFO_FORCED: ++ case SCLP_CMDW_READ_CPU_INFO: ++ case SCLP_CMDW_CONFIGURE_IOA: ++ case SCLP_CMDW_DECONFIGURE_IOA: ++ case SCLP_CMD_READ_EVENT_DATA: ++ case SCLP_CMD_WRITE_EVENT_DATA: ++ case SCLP_CMD_WRITE_EVENT_MASK: ++ return true; ++ } ++ return false; ++} ++ + static void prepare_cpu_entries(SCLPDevice *sclp, CPUEntry *entry, int *count) + { + MachineState *ms = MACHINE(qdev_get_machine()); +@@ -193,6 +209,34 @@ static void sclp_execute(SCLPDevice *sclp, SCCB *sccb, uint32_t code) + } + } + ++/* ++ * We only need the address to have something valid for the ++ * service_interrupt call. 
++ */ ++#define SCLP_PV_DUMMY_ADDR 0x4000 ++int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, ++ uint32_t code) ++{ ++ SCLPDevice *sclp = get_sclp_device(); ++ SCLPDeviceClass *sclp_c = SCLP_GET_CLASS(sclp); ++ SCCB work_sccb; ++ hwaddr sccb_len = sizeof(SCCB); ++ ++ s390_cpu_pv_mem_read(env_archcpu(env), 0, &work_sccb, sccb_len); ++ ++ if (!sclp_command_code_valid(code)) { ++ work_sccb.h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); ++ goto out_write; ++ } ++ ++ sclp_c->execute(sclp, &work_sccb, code); ++out_write: ++ s390_cpu_pv_mem_write(env_archcpu(env), 0, &work_sccb, ++ be16_to_cpu(work_sccb.h.length)); ++ sclp_c->service_interrupt(sclp, SCLP_PV_DUMMY_ADDR); ++ return 0; ++} ++ + int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + { + SCLPDevice *sclp = get_sclp_device(); +@@ -230,17 +274,7 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + goto out; + } + +- switch (code & SCLP_CMD_CODE_MASK) { +- case SCLP_CMDW_READ_SCP_INFO: +- case SCLP_CMDW_READ_SCP_INFO_FORCED: +- case SCLP_CMDW_READ_CPU_INFO: +- case SCLP_CMDW_CONFIGURE_IOA: +- case SCLP_CMDW_DECONFIGURE_IOA: +- case SCLP_CMD_READ_EVENT_DATA: +- case SCLP_CMD_WRITE_EVENT_DATA: +- case SCLP_CMD_WRITE_EVENT_MASK: +- break; +- default: ++ if (!sclp_command_code_valid(code)) { + work_sccb.h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); + goto out_write; + } +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index c54413b78c..c0a3faa37d 100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -217,5 +217,7 @@ void s390_sclp_init(void); + void sclp_service_interrupt(uint32_t sccb); + void raise_irq_cpu_hotplug(void); + int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code); ++int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, ++ uint32_t code); + + #endif +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 6809a5ac40..56fe60c49c 100644 +--- 
a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -1230,12 +1230,27 @@ static void kvm_sclp_service_call(S390CPU *cpu, struct kvm_run *run, + sccb = env->regs[ipbh0 & 0xf]; + code = env->regs[(ipbh0 & 0xf0) >> 4]; + +- r = sclp_service_call(env, sccb, code); +- if (r < 0) { +- kvm_s390_program_interrupt(cpu, -r); +- return; ++ switch (run->s390_sieic.icptcode) { ++ case ICPT_PV_INSTR_NOTIFICATION: ++ g_assert(s390_is_pv()); ++ /* The notification intercepts are currently handled by KVM */ ++ error_report("unexpected SCLP PV notification"); ++ exit(1); ++ break; ++ case ICPT_PV_INSTR: ++ g_assert(s390_is_pv()); ++ sclp_service_call_protected(env, sccb, code); ++ /* Setting the CC is done by the Ultravisor. */ ++ break; ++ case ICPT_INSTRUCTION: ++ g_assert(!s390_is_pv()); ++ r = sclp_service_call(env, sccb, code); ++ if (r < 0) { ++ kvm_s390_program_interrupt(cpu, -r); ++ return; ++ } ++ setcc(cpu, r); + } +- setcc(cpu, r); + } + + static int handle_b2(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1) +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Set-guest-IPL-PSW.patch b/kvm-s390x-protvirt-Set-guest-IPL-PSW.patch new file mode 100755 index 0000000..ef246c7 --- /dev/null +++ b/kvm-s390x-protvirt-Set-guest-IPL-PSW.patch @@ -0,0 +1,75 @@ +From d738b4336c79be68b6040f73427e089f46957728 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:11 -0400 +Subject: [PATCH 29/42] s390x: protvirt: Set guest IPL PSW + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-30-thuth@redhat.com> +Patchwork-id: 97049 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 29/38] s390x: protvirt: Set guest IPL PSW +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Handling of CPU reset and setting of the IPL psw from guest storage at +offset 0 is done by a Ultravisor call. Let's only fetch it if +necessary. 
+ +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-11-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 59181010a2ff82c3a97e9b5768ee87c38e4815f1) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.c | 26 +++++++++++++++++--------- + 1 file changed, 17 insertions(+), 9 deletions(-) + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 8f38cd8e6f..371b91b2d7 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -77,16 +77,24 @@ static bool s390_cpu_has_work(CPUState *cs) + static void s390_cpu_load_normal(CPUState *s) + { + S390CPU *cpu = S390_CPU(s); +- uint64_t spsw = ldq_phys(s->as, 0); +- +- cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL; +- /* +- * Invert short psw indication, so SIE will report a specification +- * exception if it was not set. +- */ +- cpu->env.psw.mask ^= PSW_MASK_SHORTPSW; +- cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR; ++ uint64_t spsw; + ++ if (!s390_is_pv()) { ++ spsw = ldq_phys(s->as, 0); ++ cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL; ++ /* ++ * Invert short psw indication, so SIE will report a specification ++ * exception if it was not set. ++ */ ++ cpu->env.psw.mask ^= PSW_MASK_SHORTPSW; ++ cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR; ++ } else { ++ /* ++ * Firmware requires us to set the load state before we set ++ * the cpu to operating on protected guests. 
++ */ ++ s390_cpu_set_state(S390_CPU_STATE_LOAD, cpu); ++ } + s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); + } + #endif +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Support-unpack-facility.patch b/kvm-s390x-protvirt-Support-unpack-facility.patch new file mode 100755 index 0000000..204de2a --- /dev/null +++ b/kvm-s390x-protvirt-Support-unpack-facility.patch @@ -0,0 +1,886 @@ +From e6474080e3816e82e87c545a3d22db77c55ab053 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:04 -0400 +Subject: [PATCH 22/42] s390x: protvirt: Support unpack facility + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-23-thuth@redhat.com> +Patchwork-id: 97045 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 22/38] s390x: protvirt: Support unpack facility +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +The unpack facility provides the means to setup a protected guest. A +protected guest cannot be introspected by the hypervisor or any +user/administrator of the machine it is running on. + +Protected guests are encrypted at rest and need a special boot +mechanism via diag308 subcode 8 and 10. + +Code 8 sets the PV specific IPLB which is retained separately from +those set via code 5. + +Code 10 is used to unpack the VM into protected memory, verify its +integrity and start it. + +Signed-off-by: Janosch Frank +Co-developed-by: Christian Borntraeger [Changes +to machine] +Reviewed-by: David Hildenbrand +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200323083606.24520-1-frankja@linux.ibm.com> +[CH: fixed up KVM_PV_VM_ -> KVM_PV_] +Signed-off-by: Cornelia Huck +(cherry picked from commit c3347ed0d2ee42a7dcf7bfe7f9c3884a9596727a) +Signed-off-by: Danilo C. L. 
de Paula +--- + MAINTAINERS | 2 + + hw/s390x/Makefile.objs | 1 + + hw/s390x/ipl.c | 59 +++++++++++++- + hw/s390x/ipl.h | 91 ++++++++++++++++++++- + hw/s390x/pv.c | 98 +++++++++++++++++++++++ + hw/s390x/s390-virtio-ccw.c | 119 +++++++++++++++++++++++++++- + include/hw/s390x/pv.h | 55 +++++++++++++ + include/hw/s390x/s390-virtio-ccw.h | 1 + + target/s390x/cpu.c | 1 + + target/s390x/cpu_features_def.inc.h | 1 + + target/s390x/diag.c | 39 ++++++++- + target/s390x/kvm-stub.c | 5 ++ + target/s390x/kvm.c | 5 ++ + target/s390x/kvm_s390x.h | 1 + + 14 files changed, 468 insertions(+), 10 deletions(-) + create mode 100644 hw/s390x/pv.c + create mode 100644 include/hw/s390x/pv.h + +diff --git a/MAINTAINERS b/MAINTAINERS +index 49d5d44edc..2742c95575 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -385,6 +385,8 @@ F: target/s390x/machine.c + F: target/s390x/sigp.c + F: target/s390x/cpu_features*.[ch] + F: target/s390x/cpu_models.[ch] ++F: hw/s390x/pv.c ++F: include/hw/s390x/pv.h + F: hw/intc/s390_flic.c + F: hw/intc/s390_flic_kvm.c + F: include/hw/s390x/s390_flic.h +diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs +index e02ed80b68..a46a1c7894 100644 +--- a/hw/s390x/Makefile.objs ++++ b/hw/s390x/Makefile.objs +@@ -31,6 +31,7 @@ obj-y += tod-qemu.o + obj-$(CONFIG_KVM) += tod-kvm.o + obj-$(CONFIG_KVM) += s390-skeys-kvm.o + obj-$(CONFIG_KVM) += s390-stattrib-kvm.o ++obj-$(CONFIG_KVM) += pv.o + obj-y += s390-ccw.o + obj-y += ap-device.o + obj-y += ap-bridge.o +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index fa0409dc23..586d95b5b6 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -1,10 +1,11 @@ + /* + * bootloader support + * +- * Copyright IBM, Corp. 2012 ++ * Copyright IBM, Corp. 2012, 2020 + * + * Authors: + * Christian Borntraeger ++ * Janosch Frank + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at your + * option) any later version. See the COPYING file in the top-level directory. 
+@@ -27,6 +28,7 @@ + #include "hw/s390x/vfio-ccw.h" + #include "hw/s390x/css.h" + #include "hw/s390x/ebcdic.h" ++#include "hw/s390x/pv.h" + #include "ipl.h" + #include "qemu/error-report.h" + #include "qemu/config-file.h" +@@ -557,12 +559,31 @@ void s390_ipl_update_diag308(IplParameterBlock *iplb) + { + S390IPLState *ipl = get_ipl_device(); + +- ipl->iplb = *iplb; +- ipl->iplb_valid = true; ++ /* ++ * The IPLB set and retrieved by subcodes 8/9 is completely ++ * separate from the one managed via subcodes 5/6. ++ */ ++ if (iplb->pbt == S390_IPL_TYPE_PV) { ++ ipl->iplb_pv = *iplb; ++ ipl->iplb_valid_pv = true; ++ } else { ++ ipl->iplb = *iplb; ++ ipl->iplb_valid = true; ++ } + ipl->netboot = is_virtio_net_device(iplb); + update_machine_ipl_properties(iplb); + } + ++IplParameterBlock *s390_ipl_get_iplb_pv(void) ++{ ++ S390IPLState *ipl = get_ipl_device(); ++ ++ if (!ipl->iplb_valid_pv) { ++ return NULL; ++ } ++ return &ipl->iplb_pv; ++} ++ + IplParameterBlock *s390_ipl_get_iplb(void) + { + S390IPLState *ipl = get_ipl_device(); +@@ -651,6 +672,38 @@ static void s390_ipl_prepare_qipl(S390CPU *cpu) + cpu_physical_memory_unmap(addr, len, 1, len); + } + ++int s390_ipl_prepare_pv_header(void) ++{ ++ IplParameterBlock *ipib = s390_ipl_get_iplb_pv(); ++ IPLBlockPV *ipib_pv = &ipib->pv; ++ void *hdr = g_malloc(ipib_pv->pv_header_len); ++ int rc; ++ ++ cpu_physical_memory_read(ipib_pv->pv_header_addr, hdr, ++ ipib_pv->pv_header_len); ++ rc = s390_pv_set_sec_parms((uintptr_t)hdr, ++ ipib_pv->pv_header_len); ++ g_free(hdr); ++ return rc; ++} ++ ++int s390_ipl_pv_unpack(void) ++{ ++ IplParameterBlock *ipib = s390_ipl_get_iplb_pv(); ++ IPLBlockPV *ipib_pv = &ipib->pv; ++ int i, rc = 0; ++ ++ for (i = 0; i < ipib_pv->num_comp; i++) { ++ rc = s390_pv_unpack(ipib_pv->components[i].addr, ++ TARGET_PAGE_ALIGN(ipib_pv->components[i].size), ++ ipib_pv->components[i].tweak_pref); ++ if (rc) { ++ break; ++ } ++ } ++ return rc; ++} ++ + void s390_ipl_prepare_cpu(S390CPU *cpu) + { + 
S390IPLState *ipl = get_ipl_device(); +diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h +index a5665e6bfd..89b3044d7a 100644 +--- a/hw/s390x/ipl.h ++++ b/hw/s390x/ipl.h +@@ -1,8 +1,9 @@ + /* + * s390 IPL device + * +- * Copyright 2015 IBM Corp. ++ * Copyright 2015, 2020 IBM Corp. + * Author(s): Zhang Fan ++ * Janosch Frank + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level +@@ -15,6 +16,24 @@ + #include "cpu.h" + #include "hw/qdev-core.h" + ++struct IPLBlockPVComp { ++ uint64_t tweak_pref; ++ uint64_t addr; ++ uint64_t size; ++} QEMU_PACKED; ++typedef struct IPLBlockPVComp IPLBlockPVComp; ++ ++struct IPLBlockPV { ++ uint8_t reserved18[87]; /* 0x18 */ ++ uint8_t version; /* 0x6f */ ++ uint32_t reserved70; /* 0x70 */ ++ uint32_t num_comp; /* 0x74 */ ++ uint64_t pv_header_addr; /* 0x78 */ ++ uint64_t pv_header_len; /* 0x80 */ ++ struct IPLBlockPVComp components[]; ++} QEMU_PACKED; ++typedef struct IPLBlockPV IPLBlockPV; ++ + struct IplBlockCcw { + uint8_t reserved0[85]; + uint8_t ssid; +@@ -71,6 +90,7 @@ union IplParameterBlock { + union { + IplBlockCcw ccw; + IplBlockFcp fcp; ++ IPLBlockPV pv; + IplBlockQemuScsi scsi; + }; + } QEMU_PACKED; +@@ -85,8 +105,11 @@ typedef union IplParameterBlock IplParameterBlock; + + int s390_ipl_set_loadparm(uint8_t *loadparm); + void s390_ipl_update_diag308(IplParameterBlock *iplb); ++int s390_ipl_prepare_pv_header(void); ++int s390_ipl_pv_unpack(void); + void s390_ipl_prepare_cpu(S390CPU *cpu); + IplParameterBlock *s390_ipl_get_iplb(void); ++IplParameterBlock *s390_ipl_get_iplb_pv(void); + + enum s390_reset { + /* default is a reset not triggered by a CPU e.g. 
issued by QMP */ +@@ -94,6 +117,7 @@ enum s390_reset { + S390_RESET_REIPL, + S390_RESET_MODIFIED_CLEAR, + S390_RESET_LOAD_NORMAL, ++ S390_RESET_PV, + }; + void s390_ipl_reset_request(CPUState *cs, enum s390_reset reset_type); + void s390_ipl_get_reset_request(CPUState **cs, enum s390_reset *reset_type); +@@ -133,6 +157,7 @@ struct S390IPLState { + /*< private >*/ + DeviceState parent_obj; + IplParameterBlock iplb; ++ IplParameterBlock iplb_pv; + QemuIplParameters qipl; + uint64_t start_addr; + uint64_t compat_start_addr; +@@ -140,6 +165,7 @@ struct S390IPLState { + uint64_t compat_bios_start_addr; + bool enforce_bios; + bool iplb_valid; ++ bool iplb_valid_pv; + bool netboot; + /* reset related properties don't have to be migrated or reset */ + enum s390_reset reset_type; +@@ -162,6 +188,8 @@ QEMU_BUILD_BUG_MSG(offsetof(S390IPLState, iplb) & 3, "alignment of iplb wrong"); + #define DIAG_308_RC_OK 0x0001 + #define DIAG_308_RC_NO_CONF 0x0102 + #define DIAG_308_RC_INVALID 0x0402 ++#define DIAG_308_RC_NO_PV_CONF 0x0902 ++#define DIAG_308_RC_INVAL_FOR_PV 0x0a02 + + #define DIAG308_RESET_MOD_CLR 0 + #define DIAG308_RESET_LOAD_NORM 1 +@@ -169,12 +197,17 @@ QEMU_BUILD_BUG_MSG(offsetof(S390IPLState, iplb) & 3, "alignment of iplb wrong"); + #define DIAG308_LOAD_NORMAL_DUMP 4 + #define DIAG308_SET 5 + #define DIAG308_STORE 6 ++#define DIAG308_PV_SET 8 ++#define DIAG308_PV_STORE 9 ++#define DIAG308_PV_START 10 + + #define S390_IPL_TYPE_FCP 0x00 + #define S390_IPL_TYPE_CCW 0x02 ++#define S390_IPL_TYPE_PV 0x05 + #define S390_IPL_TYPE_QEMU_SCSI 0xff + + #define S390_IPLB_HEADER_LEN 8 ++#define S390_IPLB_MIN_PV_LEN 148 + #define S390_IPLB_MIN_CCW_LEN 200 + #define S390_IPLB_MIN_FCP_LEN 384 + #define S390_IPLB_MIN_QEMU_SCSI_LEN 200 +@@ -184,6 +217,62 @@ static inline bool iplb_valid_len(IplParameterBlock *iplb) + return be32_to_cpu(iplb->len) <= sizeof(IplParameterBlock); + } + ++static inline bool ipl_valid_pv_components(IplParameterBlock *iplb) ++{ ++ IPLBlockPV *ipib_pv = 
&iplb->pv; ++ int i; ++ ++ if (ipib_pv->num_comp == 0) { ++ return false; ++ } ++ ++ for (i = 0; i < ipib_pv->num_comp; i++) { ++ /* Addr must be 4k aligned */ ++ if (ipib_pv->components[i].addr & ~TARGET_PAGE_MASK) { ++ return false; ++ } ++ ++ /* Tweak prefix is monotonically increasing with each component */ ++ if (i < ipib_pv->num_comp - 1 && ++ ipib_pv->components[i].tweak_pref >= ++ ipib_pv->components[i + 1].tweak_pref) { ++ return false; ++ } ++ } ++ return true; ++} ++ ++static inline bool ipl_valid_pv_header(IplParameterBlock *iplb) ++{ ++ IPLBlockPV *ipib_pv = &iplb->pv; ++ ++ if (ipib_pv->pv_header_len > 2 * TARGET_PAGE_SIZE) { ++ return false; ++ } ++ ++ if (!address_space_access_valid(&address_space_memory, ++ ipib_pv->pv_header_addr, ++ ipib_pv->pv_header_len, ++ false, ++ MEMTXATTRS_UNSPECIFIED)) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static inline bool iplb_valid_pv(IplParameterBlock *iplb) ++{ ++ if (iplb->pbt != S390_IPL_TYPE_PV || ++ be32_to_cpu(iplb->len) < S390_IPLB_MIN_PV_LEN) { ++ return false; ++ } ++ if (!ipl_valid_pv_header(iplb)) { ++ return false; ++ } ++ return ipl_valid_pv_components(iplb); ++} ++ + static inline bool iplb_valid(IplParameterBlock *iplb) + { + switch (iplb->pbt) { +diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c +new file mode 100644 +index 0000000000..a40a844806 +--- /dev/null ++++ b/hw/s390x/pv.c +@@ -0,0 +1,98 @@ ++/* ++ * Protected Virtualization functions ++ * ++ * Copyright IBM Corp. 2020 ++ * Author(s): ++ * Janosch Frank ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. 
++ */ ++#include "qemu/osdep.h" ++ ++#include ++ ++#include "qemu/error-report.h" ++#include "sysemu/kvm.h" ++#include "hw/s390x/pv.h" ++ ++static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data) ++{ ++ struct kvm_pv_cmd pv_cmd = { ++ .cmd = cmd, ++ .data = (uint64_t)data, ++ }; ++ int rc = kvm_vm_ioctl(kvm_state, KVM_S390_PV_COMMAND, &pv_cmd); ++ ++ if (rc) { ++ error_report("KVM PV command %d (%s) failed: header rc %x rrc %x " ++ "IOCTL rc: %d", cmd, cmdname, pv_cmd.rc, pv_cmd.rrc, ++ rc); ++ } ++ return rc; ++} ++ ++/* ++ * This macro lets us pass the command as a string to the function so ++ * we can print it on an error. ++ */ ++#define s390_pv_cmd(cmd, data) __s390_pv_cmd(cmd, #cmd, data); ++#define s390_pv_cmd_exit(cmd, data) \ ++{ \ ++ int rc; \ ++ \ ++ rc = __s390_pv_cmd(cmd, #cmd, data);\ ++ if (rc) { \ ++ exit(1); \ ++ } \ ++} ++ ++int s390_pv_vm_enable(void) ++{ ++ return s390_pv_cmd(KVM_PV_ENABLE, NULL); ++} ++ ++void s390_pv_vm_disable(void) ++{ ++ s390_pv_cmd_exit(KVM_PV_DISABLE, NULL); ++} ++ ++int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) ++{ ++ struct kvm_s390_pv_sec_parm args = { ++ .origin = origin, ++ .length = length, ++ }; ++ ++ return s390_pv_cmd(KVM_PV_SET_SEC_PARMS, &args); ++} ++ ++/* ++ * Called for each component in the SE type IPL parameter block 0. 
++ */ ++int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) ++{ ++ struct kvm_s390_pv_unp args = { ++ .addr = addr, ++ .size = size, ++ .tweak = tweak, ++ }; ++ ++ return s390_pv_cmd(KVM_PV_UNPACK, &args); ++} ++ ++void s390_pv_perf_clear_reset(void) ++{ ++ s390_pv_cmd_exit(KVM_PV_PREP_RESET, NULL); ++} ++ ++int s390_pv_verify(void) ++{ ++ return s390_pv_cmd(KVM_PV_VERIFY, NULL); ++} ++ ++void s390_pv_unshare(void) ++{ ++ s390_pv_cmd_exit(KVM_PV_UNSHARE_ALL, NULL); ++} +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 4ea01c53c0..82da1d9ab5 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -1,9 +1,10 @@ + /* + * virtio ccw machine + * +- * Copyright 2012 IBM Corp. ++ * Copyright 2012, 2020 IBM Corp. + * Copyright (c) 2009 Alexander Graf + * Author(s): Cornelia Huck ++ * Janosch Frank + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level +@@ -41,6 +42,8 @@ + #include "hw/qdev-properties.h" + #include "hw/s390x/tod.h" + #include "sysemu/sysemu.h" ++#include "hw/s390x/pv.h" ++#include + + S390CPU *s390_cpu_addr2state(uint16_t cpu_addr) + { +@@ -318,10 +321,78 @@ static inline void s390_do_cpu_ipl(CPUState *cs, run_on_cpu_data arg) + s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); + } + ++static void s390_machine_unprotect(S390CcwMachineState *ms) ++{ ++ s390_pv_vm_disable(); ++ ms->pv = false; ++} ++ ++static int s390_machine_protect(S390CcwMachineState *ms) ++{ ++ int rc; ++ ++ /* Create SE VM */ ++ rc = s390_pv_vm_enable(); ++ if (rc) { ++ return rc; ++ } ++ ++ ms->pv = true; ++ ++ /* Set SE header and unpack */ ++ rc = s390_ipl_prepare_pv_header(); ++ if (rc) { ++ goto out_err; ++ } ++ ++ /* Decrypt image */ ++ rc = s390_ipl_pv_unpack(); ++ if (rc) { ++ goto out_err; ++ } ++ ++ /* Verify integrity */ ++ rc = s390_pv_verify(); ++ if (rc) { ++ goto out_err; ++ } ++ return rc; ++ ++out_err: ++ 
s390_machine_unprotect(ms); ++ return rc; ++} ++ ++static void s390_machine_inject_pv_error(CPUState *cs) ++{ ++ int r1 = (cs->kvm_run->s390_sieic.ipa & 0x00f0) >> 4; ++ CPUS390XState *env = &S390_CPU(cs)->env; ++ ++ /* Report that we are unable to enter protected mode */ ++ env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; ++} ++ ++static void s390_pv_prepare_reset(S390CcwMachineState *ms) ++{ ++ CPUState *cs; ++ ++ if (!s390_is_pv()) { ++ return; ++ } ++ /* Unsharing requires all cpus to be stopped */ ++ CPU_FOREACH(cs) { ++ s390_cpu_set_state(S390_CPU_STATE_STOPPED, S390_CPU(cs)); ++ } ++ s390_pv_unshare(); ++ s390_pv_perf_clear_reset(); ++} ++ + static void s390_machine_reset(MachineState *machine) + { ++ S390CcwMachineState *ms = S390_CCW_MACHINE(machine); + enum s390_reset reset_type; + CPUState *cs, *t; ++ S390CPU *cpu; + + /* get the reset parameters, reset them once done */ + s390_ipl_get_reset_request(&cs, &reset_type); +@@ -329,9 +400,15 @@ static void s390_machine_reset(MachineState *machine) + /* all CPUs are paused and synchronized at this point */ + s390_cmma_reset(); + ++ cpu = S390_CPU(cs); ++ + switch (reset_type) { + case S390_RESET_EXTERNAL: + case S390_RESET_REIPL: ++ if (s390_is_pv()) { ++ s390_machine_unprotect(ms); ++ } ++ + qemu_devices_reset(); + s390_crypto_reset(); + +@@ -339,22 +416,56 @@ static void s390_machine_reset(MachineState *machine) + run_on_cpu(cs, s390_do_cpu_ipl, RUN_ON_CPU_NULL); + break; + case S390_RESET_MODIFIED_CLEAR: ++ /* ++ * Susbsystem reset needs to be done before we unshare memory ++ * and lose access to VIRTIO structures in guest memory. 
++ */ ++ subsystem_reset(); ++ s390_crypto_reset(); ++ s390_pv_prepare_reset(ms); + CPU_FOREACH(t) { + run_on_cpu(t, s390_do_cpu_full_reset, RUN_ON_CPU_NULL); + } +- subsystem_reset(); +- s390_crypto_reset(); + run_on_cpu(cs, s390_do_cpu_load_normal, RUN_ON_CPU_NULL); + break; + case S390_RESET_LOAD_NORMAL: ++ /* ++ * Susbsystem reset needs to be done before we unshare memory ++ * and lose access to VIRTIO structures in guest memory. ++ */ ++ subsystem_reset(); ++ s390_pv_prepare_reset(ms); + CPU_FOREACH(t) { + if (t == cs) { + continue; + } + run_on_cpu(t, s390_do_cpu_reset, RUN_ON_CPU_NULL); + } +- subsystem_reset(); + run_on_cpu(cs, s390_do_cpu_initial_reset, RUN_ON_CPU_NULL); ++ run_on_cpu(cs, s390_do_cpu_load_normal, RUN_ON_CPU_NULL); ++ break; ++ case S390_RESET_PV: /* Subcode 10 */ ++ subsystem_reset(); ++ s390_crypto_reset(); ++ ++ CPU_FOREACH(t) { ++ if (t == cs) { ++ continue; ++ } ++ run_on_cpu(t, s390_do_cpu_full_reset, RUN_ON_CPU_NULL); ++ } ++ run_on_cpu(cs, s390_do_cpu_reset, RUN_ON_CPU_NULL); ++ ++ if (s390_machine_protect(ms)) { ++ s390_machine_inject_pv_error(cs); ++ /* ++ * Continue after the diag308 so the guest knows something ++ * went wrong. ++ */ ++ s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); ++ return; ++ } ++ + run_on_cpu(cs, s390_do_cpu_load_normal, RUN_ON_CPU_NULL); + break; + default: +diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h +new file mode 100644 +index 0000000000..c6cb360f2f +--- /dev/null ++++ b/include/hw/s390x/pv.h +@@ -0,0 +1,55 @@ ++/* ++ * Protected Virtualization header ++ * ++ * Copyright IBM Corp. 2020 ++ * Author(s): ++ * Janosch Frank ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. 
++ */ ++#ifndef HW_S390_PV_H ++#define HW_S390_PV_H ++ ++#ifdef CONFIG_KVM ++#include "hw/s390x/s390-virtio-ccw.h" ++ ++static inline bool s390_is_pv(void) ++{ ++ static S390CcwMachineState *ccw; ++ Object *obj; ++ ++ if (ccw) { ++ return ccw->pv; ++ } ++ ++ /* we have to bail out for the "none" machine */ ++ obj = object_dynamic_cast(qdev_get_machine(), ++ TYPE_S390_CCW_MACHINE); ++ if (!obj) { ++ return false; ++ } ++ ccw = S390_CCW_MACHINE(obj); ++ return ccw->pv; ++} ++ ++int s390_pv_vm_enable(void); ++void s390_pv_vm_disable(void); ++int s390_pv_set_sec_parms(uint64_t origin, uint64_t length); ++int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak); ++void s390_pv_perf_clear_reset(void); ++int s390_pv_verify(void); ++void s390_pv_unshare(void); ++#else /* CONFIG_KVM */ ++static inline bool s390_is_pv(void) { return false; } ++static inline int s390_pv_vm_enable(void) { return 0; } ++static inline void s390_pv_vm_disable(void) {} ++static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { return 0; } ++static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) { return 0; } ++static inline void s390_pv_perf_clear_reset(void) {} ++static inline int s390_pv_verify(void) { return 0; } ++static inline void s390_pv_unshare(void) {} ++#endif /* CONFIG_KVM */ ++ ++#endif /* HW_S390_PV_H */ +diff --git a/include/hw/s390x/s390-virtio-ccw.h b/include/hw/s390x/s390-virtio-ccw.h +index 8aa27199c9..cd1dccc6e3 100644 +--- a/include/hw/s390x/s390-virtio-ccw.h ++++ b/include/hw/s390x/s390-virtio-ccw.h +@@ -28,6 +28,7 @@ typedef struct S390CcwMachineState { + /*< public >*/ + bool aes_key_wrap; + bool dea_key_wrap; ++ bool pv; + uint8_t loadparm[8]; + } S390CcwMachineState; + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index c0dd502b84..8f38cd8e6f 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -37,6 +37,7 @@ + #include "sysemu/hw_accel.h" + #include "hw/qdev-properties.h" + #ifndef CONFIG_USER_ONLY 
++#include "hw/s390x/pv.h" + #include "hw/boards.h" + #include "sysemu/arch_init.h" + #include "sysemu/sysemu.h" +diff --git a/target/s390x/cpu_features_def.inc.h b/target/s390x/cpu_features_def.inc.h +index 31dff0d84e..60db28351d 100644 +--- a/target/s390x/cpu_features_def.inc.h ++++ b/target/s390x/cpu_features_def.inc.h +@@ -107,6 +107,7 @@ DEF_FEAT(DEFLATE_BASE, "deflate-base", STFL, 151, "Deflate-conversion facility ( + DEF_FEAT(VECTOR_PACKED_DECIMAL_ENH, "vxpdeh", STFL, 152, "Vector-Packed-Decimal-Enhancement Facility") + DEF_FEAT(MSA_EXT_9, "msa9-base", STFL, 155, "Message-security-assist-extension-9 facility (excluding subfunctions)") + DEF_FEAT(ETOKEN, "etoken", STFL, 156, "Etoken facility") ++DEF_FEAT(UNPACK, "unpack", STFL, 161, "Unpack facility") + + /* Features exposed via SCLP SCCB Byte 80 - 98 (bit numbers relative to byte-80) */ + DEF_FEAT(SIE_GSLS, "gsls", SCLP_CONF_CHAR, 40, "SIE: Guest-storage-limit-suppression facility") +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index 8aba6341f9..b2cbefb8cf 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -20,6 +20,8 @@ + #include "sysemu/cpus.h" + #include "hw/s390x/ipl.h" + #include "hw/s390x/s390-virtio-ccw.h" ++#include "hw/s390x/pv.h" ++#include "kvm_s390x.h" + + int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3) + { +@@ -52,6 +54,10 @@ int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3) + static int diag308_parm_check(CPUS390XState *env, uint64_t r1, uint64_t addr, + uintptr_t ra, bool write) + { ++ /* Handled by the Ultravisor */ ++ if (s390_is_pv()) { ++ return 0; ++ } + if ((r1 & 1) || (addr & ~TARGET_PAGE_MASK)) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return -1; +@@ -67,6 +73,7 @@ static int diag308_parm_check(CPUS390XState *env, uint64_t r1, uint64_t addr, + + void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + { ++ bool valid; + CPUState *cs = env_cpu(env); + uint64_t addr = env->regs[r1]; + 
uint64_t subcode = env->regs[r3]; +@@ -82,6 +89,11 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + return; + } + ++ if (subcode >= DIAG308_PV_SET && !s390_has_feat(S390_FEAT_UNPACK)) { ++ s390_program_interrupt(env, PGM_SPECIFICATION, ra); ++ return; ++ } ++ + switch (subcode) { + case DIAG308_RESET_MOD_CLR: + s390_ipl_reset_request(cs, S390_RESET_MODIFIED_CLEAR); +@@ -94,6 +106,7 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + s390_ipl_reset_request(cs, S390_RESET_REIPL); + break; + case DIAG308_SET: ++ case DIAG308_PV_SET: + if (diag308_parm_check(env, r1, addr, ra, false)) { + return; + } +@@ -106,7 +119,8 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + + cpu_physical_memory_read(addr, iplb, be32_to_cpu(iplb->len)); + +- if (!iplb_valid(iplb)) { ++ valid = subcode == DIAG308_PV_SET ? iplb_valid_pv(iplb) : iplb_valid(iplb); ++ if (!valid) { + env->regs[r1 + 1] = DIAG_308_RC_INVALID; + goto out; + } +@@ -117,10 +131,15 @@ out: + g_free(iplb); + return; + case DIAG308_STORE: ++ case DIAG308_PV_STORE: + if (diag308_parm_check(env, r1, addr, ra, true)) { + return; + } +- iplb = s390_ipl_get_iplb(); ++ if (subcode == DIAG308_PV_STORE) { ++ iplb = s390_ipl_get_iplb_pv(); ++ } else { ++ iplb = s390_ipl_get_iplb(); ++ } + if (iplb) { + cpu_physical_memory_write(addr, iplb, be32_to_cpu(iplb->len)); + env->regs[r1 + 1] = DIAG_308_RC_OK; +@@ -128,6 +147,22 @@ out: + env->regs[r1 + 1] = DIAG_308_RC_NO_CONF; + } + return; ++ case DIAG308_PV_START: ++ iplb = s390_ipl_get_iplb_pv(); ++ if (!iplb) { ++ env->regs[r1 + 1] = DIAG_308_RC_NO_PV_CONF; ++ return; ++ } ++ ++ if (kvm_s390_get_hpage_1m()) { ++ error_report("Protected VMs can currently not be backed with " ++ "huge pages"); ++ env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; ++ return; ++ } ++ ++ s390_ipl_reset_request(cs, S390_RESET_PV); ++ break; + default: + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + 
break; +diff --git a/target/s390x/kvm-stub.c b/target/s390x/kvm-stub.c +index c4cd497f85..aa185017a2 100644 +--- a/target/s390x/kvm-stub.c ++++ b/target/s390x/kvm-stub.c +@@ -39,6 +39,11 @@ int kvm_s390_vcpu_interrupt_post_load(S390CPU *cpu) + return 0; + } + ++int kvm_s390_get_hpage_1m(void) ++{ ++ return 0; ++} ++ + int kvm_s390_get_ri(void) + { + return 0; +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 75d82af6fc..9a0be13959 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -321,6 +321,11 @@ void kvm_s390_set_max_pagesize(uint64_t pagesize, Error **errp) + cap_hpage_1m = 1; + } + ++int kvm_s390_get_hpage_1m(void) ++{ ++ return cap_hpage_1m; ++} ++ + static void ccw_machine_class_foreach(ObjectClass *oc, void *opaque) + { + MachineClass *mc = MACHINE_CLASS(oc); +diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h +index 0b21789796..dea813f450 100644 +--- a/target/s390x/kvm_s390x.h ++++ b/target/s390x/kvm_s390x.h +@@ -23,6 +23,7 @@ void kvm_s390_program_interrupt(S390CPU *cpu, uint16_t code); + int kvm_s390_set_cpu_state(S390CPU *cpu, uint8_t cpu_state); + void kvm_s390_vcpu_interrupt_pre_save(S390CPU *cpu); + int kvm_s390_vcpu_interrupt_post_load(S390CPU *cpu); ++int kvm_s390_get_hpage_1m(void); + int kvm_s390_get_ri(void); + int kvm_s390_get_gs(void); + int kvm_s390_get_clock(uint8_t *tod_high, uint64_t *tod_clock); +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch b/kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch new file mode 100755 index 0000000..b12b458 --- /dev/null +++ b/kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch @@ -0,0 +1,61 @@ +From 8b994757136780998e0dd1d41613d2006c0dbcf6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 4 Aug 2020 10:16:04 -0400 +Subject: [PATCH 4/4] s390x/protvirt: allow to IPL secure guests with + -no-reboot + +RH-Author: Thomas Huth +Message-id: <20200804101604.6259-2-thuth@redhat.com> +Patchwork-id: 98126 
+O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] s390x/protvirt: allow to IPL secure guests with -no-reboot +Bugzilla: 1863034 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +Right now, -no-reboot prevents secure guests from running. This is +correct from an implementation point of view, as we have modeled the +transition from non-secure to secure as a program directed IPL. From +a user perspective, this is not the behavior of least surprise. + +We should implement the IPL into protected mode similar to the +functions that we use for kdump/kexec. In other words, we do not stop +here when -no-reboot is specified on the command line. Like function 0 +or function 1, function 10 is not a classic reboot. For example, it +can only be called once. Before calling it a second time, a real +reboot/reset must happen in-between. So function code 10 is more or +less a state transition reset, but not a "standard" reset or reboot. + +Fixes: 4d226deafc44 ("s390x: protvirt: Support unpack facility") +Signed-off-by: Christian Borntraeger +Reviewed-by: Janosch Frank +Reviewed-by: David Hildenbrand +Acked-by: Viktor Mihajlovski +Message-Id: <20200721103202.30610-1-borntraeger@de.ibm.com> +[CH: tweaked description] +Signed-off-by: Cornelia Huck +(cherry picked from commit d1bb69db4ceb6897ef6a17bf263146b53a123632) +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/ipl.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index 586d95b5b6..5b3ea990af 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -624,7 +624,8 @@ void s390_ipl_reset_request(CPUState *cs, enum s390_reset reset_type) + } + } + if (reset_type == S390_RESET_MODIFIED_CLEAR || +- reset_type == S390_RESET_LOAD_NORMAL) { ++ reset_type == S390_RESET_LOAD_NORMAL || ++ reset_type == S390_RESET_PV) { + /* ignore -no-reboot, send no event */ + qemu_system_reset_request(SHUTDOWN_CAUSE_SUBSYSTEM_RESET); + } else { +-- +2.27.0 + diff --git a/kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch b/kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch new file mode 100755 index 0000000..764ceb1 --- /dev/null +++ b/kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch @@ -0,0 +1,92 @@ +From f3594f3d84a7442c194b1b9fd288e7414540ec0f Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:20 -0400 +Subject: [PATCH 38/42] s390x: pv: Fix KVM_PV_PREP_RESET command wrapper name +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-39-thuth@redhat.com> +Patchwork-id: 97051 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 38/38] s390x: pv: Fix KVM_PV_PREP_RESET command wrapper name +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Upstream: Merged in https://github.com/cohuck/qemu/tree/s390-next + +s390_pv_perf_clear_reset() is not a very helpful name since that +function needs to be called for a normal and a clear reset via +diag308. + +Let's instead name it s390_pv_prep_reset() which reflects the purpose +of the function a bit better. 
+ +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20200505124159.24099-1-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit f9628f3f6db341751002dac3be18610fa77c01ad) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/pv.c | 2 +- + hw/s390x/s390-virtio-ccw.c | 2 +- + include/hw/s390x/pv.h | 4 ++-- + 3 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c +index f11868e865..ab3a2482aa 100644 +--- a/hw/s390x/pv.c ++++ b/hw/s390x/pv.c +@@ -88,7 +88,7 @@ int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) + return s390_pv_cmd(KVM_PV_UNPACK, &args); + } + +-void s390_pv_perf_clear_reset(void) ++void s390_pv_prep_reset(void) + { + s390_pv_cmd_exit(KVM_PV_PREP_RESET, NULL); + } +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 07773a12b2..e6ed13b649 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -402,7 +402,7 @@ static void s390_pv_prepare_reset(S390CcwMachineState *ms) + s390_cpu_set_state(S390_CPU_STATE_STOPPED, S390_CPU(cs)); + } + s390_pv_unshare(); +- s390_pv_perf_clear_reset(); ++ s390_pv_prep_reset(); + } + + static void s390_machine_reset(MachineState *machine) +diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h +index 522ca6a04e..aee758bc2d 100644 +--- a/include/hw/s390x/pv.h ++++ b/include/hw/s390x/pv.h +@@ -39,7 +39,7 @@ int s390_pv_vm_enable(void); + void s390_pv_vm_disable(void); + int s390_pv_set_sec_parms(uint64_t origin, uint64_t length); + int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak); +-void s390_pv_perf_clear_reset(void); ++void s390_pv_prep_reset(void); + int s390_pv_verify(void); + void s390_pv_unshare(void); + void s390_pv_inject_reset_error(CPUState *cs); +@@ -49,7 +49,7 @@ static inline int s390_pv_vm_enable(void) { return 0; } + static inline void s390_pv_vm_disable(void) {} + static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { 
return 0; } + static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) { return 0; } +-static inline void s390_pv_perf_clear_reset(void) {} ++static inline void s390_pv_prep_reset(void) {} + static inline int s390_pv_verify(void) { return 0; } + static inline void s390_pv_unshare(void) {} + static inline void s390_pv_inject_reset_error(CPUState *cs) {}; +-- +2.27.0 + diff --git a/kvm-s390x-pv-Fix-diag318-PV-fencing.patch b/kvm-s390x-pv-Fix-diag318-PV-fencing.patch new file mode 100755 index 0000000..4dcb862 --- /dev/null +++ b/kvm-s390x-pv-Fix-diag318-PV-fencing.patch @@ -0,0 +1,114 @@ +From 722078f9fdb766c2f0990145de6732f0c36a63b7 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:16 -0500 +Subject: [PATCH 16/18] s390x: pv: Fix diag318 PV fencing + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-13-thuth@redhat.com> +Patchwork-id: 99509 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 12/12] s390x: pv: Fix diag318 PV fencing +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Diag318 fencing needs to be determined on the current VM PV state and +not on the state that the VM has when we create the CPU model. + +Fixes: fabdada935 ("s390: guest support for diagnose 0x318") +Reported-by: Marc Hartmayer +Signed-off-by: Janosch Frank +Tested-by: Marc Hartmayer +Reviewed-by: Christian Borntraeger +Reviewed-by: Collin Walling +Acked-by: David Hildenbrand +Message-Id: <20201022103135.126033-3-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 3ded270a2697852a71961b45291519ae044f25e3) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/s390x/cpu_features.c | 5 +++++ + target/s390x/cpu_features.h | 4 ++++ + target/s390x/cpu_models.c | 4 ++++ + target/s390x/kvm.c | 3 +-- + 4 files changed, 14 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/cpu_features.c b/target/s390x/cpu_features.c +index 9f817e3cfa7..e5cdf232607 100644 +--- a/target/s390x/cpu_features.c ++++ b/target/s390x/cpu_features.c +@@ -14,6 +14,7 @@ + #include "qemu/osdep.h" + #include "qemu/module.h" + #include "cpu_features.h" ++#include "hw/s390x/pv.h" + + #define DEF_FEAT(_FEAT, _NAME, _TYPE, _BIT, _DESC) \ + [S390_FEAT_##_FEAT] = { \ +@@ -105,6 +106,10 @@ void s390_fill_feat_block(const S390FeatBitmap features, S390FeatType type, + } + feat = find_next_bit(features, S390_FEAT_MAX, feat + 1); + } ++ ++ if (type == S390_FEAT_TYPE_SCLP_FAC134 && s390_is_pv()) { ++ clear_be_bit(s390_feat_def(S390_FEAT_DIAG_318)->bit, data); ++ } + } + + void s390_add_from_feat_block(S390FeatBitmap features, S390FeatType type, +diff --git a/target/s390x/cpu_features.h b/target/s390x/cpu_features.h +index f74f7fc3a11..d3c685a04c8 100644 +--- a/target/s390x/cpu_features.h ++++ b/target/s390x/cpu_features.h +@@ -81,6 +81,10 @@ const S390FeatGroupDef *s390_feat_group_def(S390FeatGroup group); + + #define BE_BIT_NR(BIT) (BIT ^ (BITS_PER_LONG - 1)) + ++static inline void clear_be_bit(unsigned int bit_nr, uint8_t *array) ++{ ++ array[bit_nr / 8] &= ~(0x80 >> (bit_nr % 8)); ++} + static inline void set_be_bit(unsigned int bit_nr, uint8_t *array) + { + array[bit_nr / 8] |= 0x80 >> (bit_nr % 8); +diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c +index bf6a3faba9e..d489923cb8a 100644 +--- a/target/s390x/cpu_models.c ++++ b/target/s390x/cpu_models.c +@@ -29,6 +29,7 @@ + #include "hw/pci/pci.h" + #endif + #include "qapi/qapi-commands-machine-target.h" ++#include "hw/s390x/pv.h" + + #define CPUDEF_INIT(_type, _gen, _ec_ga, _mha_pow, _hmfai, _name, _desc) \ + { \ +@@ -238,6 +239,9 @@ bool s390_has_feat(S390Feat feat) + 
} + return 0; + } ++ if (feat == S390_FEAT_DIAG_318 && s390_is_pv()) { ++ return false; ++ } + return test_bit(feat, cpu->model->features); + } + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index e5e190d21c9..6edb52f6d25 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -2483,8 +2483,7 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp) + */ + set_bit(S390_FEAT_EXTENDED_LENGTH_SCCB, model->features); + +- /* DIAGNOSE 0x318 is not supported under protected virtualization */ +- if (!s390_is_pv() && kvm_check_extension(kvm_state, KVM_CAP_S390_DIAG318)) { ++ if (kvm_check_extension(kvm_state, KVM_CAP_S390_DIAG318)) { + set_bit(S390_FEAT_DIAG_318, model->features); + } + +-- +2.27.0 + diff --git a/kvm-s390x-pv-Remove-sclp-boundary-checks.patch b/kvm-s390x-pv-Remove-sclp-boundary-checks.patch new file mode 100755 index 0000000..51ceb48 --- /dev/null +++ b/kvm-s390x-pv-Remove-sclp-boundary-checks.patch @@ -0,0 +1,57 @@ +From cf3d958b14e21fde929e67262b6e192592d95359 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:15 -0500 +Subject: [PATCH 15/18] s390x: pv: Remove sclp boundary checks + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-12-thuth@redhat.com> +Patchwork-id: 99508 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 11/12] s390x: pv: Remove sclp boundary checks +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +The SCLP boundary cross check is done by the Ultravisor for a +protected guest, hence we don't need to do it. As QEMU doesn't get a +valid SCCB address in protected mode this is even problematic and can +lead to QEMU reporting a false boundary cross error. 
+ +Fixes: db13387ca0 ("s390/sclp: rework sclp boundary checks") +Reported-by: Marc Hartmayer +Signed-off-by: Janosch Frank +Tested-by: Marc Hartmayer +Reviewed-by: Christian Borntraeger +Reviewed-by: Thomas Huth +Reviewed-by: Collin Walling +Acked-by: Halil Pasic +Acked-by: David Hildenbrand +Message-Id: <20201022103135.126033-2-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 3df4843d0e612a3c838e8d94c3e9c24520f2e680) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 2931046f456..03f847b2c8a 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -285,11 +285,6 @@ int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length), code)) { +- work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); +- goto out_write; +- } +- + sclp_c->execute(sclp, work_sccb, code); + out_write: + s390_cpu_pv_mem_write(env_archcpu(env), 0, work_sccb, +-- +2.27.0 + diff --git a/kvm-s390x-pv-Retry-ioctls-on-EINTR.patch b/kvm-s390x-pv-Retry-ioctls-on-EINTR.patch new file mode 100755 index 0000000..65208c7 --- /dev/null +++ b/kvm-s390x-pv-Retry-ioctls-on-EINTR.patch @@ -0,0 +1,57 @@ +From 1678288d945906d83d7adae109b842080aebaf19 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:18 -0400 +Subject: [PATCH 36/42] s390x/pv: Retry ioctls on -EINTR +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-37-thuth@redhat.com> +Patchwork-id: 97055 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 36/38] s390x/pv: Retry ioctls on -EINTR +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +PV_ENABLE (and maybe 
others) might return -EINTR when a signal is +pending. See the Linux kernel patch "s390/gmap: return proper error code +on ksm unsharing" for details. Let us retry the ioctl in that case. + +Fixes: c3347ed0d2ee ("s390x: protvirt: Support unpack facility") +Reported-by: Marc Hartmayer +Acked-by: Janosch Frank +Tested-by: Marc Hartmayer +Signed-off-by: Christian Borntraeger +Message-Id: <20200327124616.34866-1-borntraeger@de.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit e8d12a55f6d3e577455b02f15907c460578c689b) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/pv.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c +index a40a844806..cb0dce4a4f 100644 +--- a/hw/s390x/pv.c ++++ b/hw/s390x/pv.c +@@ -23,7 +23,11 @@ static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data) + .cmd = cmd, + .data = (uint64_t)data, + }; +- int rc = kvm_vm_ioctl(kvm_state, KVM_S390_PV_COMMAND, &pv_cmd); ++ int rc; ++ ++ do { ++ rc = kvm_vm_ioctl(kvm_state, KVM_S390_PV_COMMAND, &pv_cmd); ++ } while (rc == -EINTR); + + if (rc) { + error_report("KVM PV command %d (%s) failed: header rc %x rrc %x " +-- +2.27.0 + diff --git a/kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch b/kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch new file mode 100755 index 0000000..e78f4da --- /dev/null +++ b/kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch @@ -0,0 +1,150 @@ +From 0db8d909a2f3c53d12b0ae12307965f9a8193dbc Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:19 -0400 +Subject: [PATCH 37/42] s390x/s390-virtio-ccw: Fix build on systems without KVM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-38-thuth@redhat.com> +Patchwork-id: 97047 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 37/38] s390x/s390-virtio-ccw: Fix build on systems without KVM 
+Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +linux/kvm.h is not available on all platforms. Let us move +s390_machine_inject_pv_error into pv.c as it uses KVM structures. +Also rename the function to s390_pv_inject_reset_error. + +While at it, ipl.h needs an include for "exec/address-spaces.h" +as it uses address_space_memory. + +Fixes: c3347ed0d2ee ("s390x: protvirt: Support unpack facility") +Reported-by: Bruce Rogers +Signed-off-by: Christian Borntraeger +Message-Id: <20200406100158.5940-2-borntraeger@de.ibm.com> +Reviewed-by: David Hildenbrand +Signed-off-by: Cornelia Huck +(cherry picked from commit fbc1384ccd48fa7c0c38f950adf7992a4fb6042e) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.h | 1 + + hw/s390x/pv.c | 11 +++++++++++ + hw/s390x/s390-virtio-ccw.c | 12 +----------- + include/hw/s390x/pv.h | 3 +++ + 4 files changed, 16 insertions(+), 11 deletions(-) + +diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h +index 89b3044d7a..53cc9eb5ac 100644 +--- a/hw/s390x/ipl.h ++++ b/hw/s390x/ipl.h +@@ -14,6 +14,7 @@ + #define HW_S390_IPL_H + + #include "cpu.h" ++#include "exec/address-spaces.h" + #include "hw/qdev-core.h" + + struct IPLBlockPVComp { +diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c +index cb0dce4a4f..f11868e865 100644 +--- a/hw/s390x/pv.c ++++ b/hw/s390x/pv.c +@@ -13,8 +13,10 @@ + + #include + ++#include "cpu.h" + #include "qemu/error-report.h" + #include "sysemu/kvm.h" ++#include "hw/s390x/ipl.h" + #include "hw/s390x/pv.h" + + static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data) +@@ -100,3 +102,12 @@ void s390_pv_unshare(void) + { + s390_pv_cmd_exit(KVM_PV_UNSHARE_ALL, NULL); + } ++ ++void s390_pv_inject_reset_error(CPUState *cs) ++{ ++ int r1 = (cs->kvm_run->s390_sieic.ipa & 0x00f0) >> 4; ++ CPUS390XState *env = &S390_CPU(cs)->env; ++ ++ /* Report that we are unable to enter protected mode */ ++ 
env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; ++} +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index c08e42bda1..07773a12b2 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -44,7 +44,6 @@ + #include "sysemu/sysemu.h" + #include "sysemu/balloon.h" + #include "hw/s390x/pv.h" +-#include + #include "migration/blocker.h" + + static Error *pv_mig_blocker; +@@ -391,15 +390,6 @@ out_err: + return rc; + } + +-static void s390_machine_inject_pv_error(CPUState *cs) +-{ +- int r1 = (cs->kvm_run->s390_sieic.ipa & 0x00f0) >> 4; +- CPUS390XState *env = &S390_CPU(cs)->env; +- +- /* Report that we are unable to enter protected mode */ +- env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; +-} +- + static void s390_pv_prepare_reset(S390CcwMachineState *ms) + { + CPUState *cs; +@@ -485,7 +475,7 @@ static void s390_machine_reset(MachineState *machine) + run_on_cpu(cs, s390_do_cpu_reset, RUN_ON_CPU_NULL); + + if (s390_machine_protect(ms)) { +- s390_machine_inject_pv_error(cs); ++ s390_pv_inject_reset_error(cs); + /* + * Continue after the diag308 so the guest knows something + * went wrong. 
+diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h +index c6cb360f2f..522ca6a04e 100644 +--- a/include/hw/s390x/pv.h ++++ b/include/hw/s390x/pv.h +@@ -13,6 +13,7 @@ + #define HW_S390_PV_H + + #ifdef CONFIG_KVM ++#include "cpu.h" + #include "hw/s390x/s390-virtio-ccw.h" + + static inline bool s390_is_pv(void) +@@ -41,6 +42,7 @@ int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak); + void s390_pv_perf_clear_reset(void); + int s390_pv_verify(void); + void s390_pv_unshare(void); ++void s390_pv_inject_reset_error(CPUState *cs); + #else /* CONFIG_KVM */ + static inline bool s390_is_pv(void) { return false; } + static inline int s390_pv_vm_enable(void) { return 0; } +@@ -50,6 +52,7 @@ static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) { + static inline void s390_pv_perf_clear_reset(void) {} + static inline int s390_pv_verify(void) { return 0; } + static inline void s390_pv_unshare(void) {} ++static inline void s390_pv_inject_reset_error(CPUState *cs) {}; + #endif /* CONFIG_KVM */ + + #endif /* HW_S390_PV_H */ +-- +2.27.0 + diff --git a/kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch b/kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch new file mode 100755 index 0000000..f90dc30 --- /dev/null +++ b/kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch @@ -0,0 +1,52 @@ +From fa4e13a01ecc316cc43c1f39490330b94c910bc1 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Mon, 14 Dec 2020 18:29:49 -0500 +Subject: [PATCH 04/14] s390x/s390-virtio-ccw: Reset PCI devices during + subsystem reset + +RH-Author: Thomas Huth +Message-id: <20201214182949.35712-2-thuth@redhat.com> +Patchwork-id: 100440 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] s390x/s390-virtio-ccw: Reset PCI devices during subsystem reset +Bugzilla: 1905386 +RH-Acked-by: Danilo de Paula +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +From: Matthew Rosato + +Currently, a subsystem reset event leaves PCI devices enabled, 
causing +issues post-reset in the guest (an example would be after a kexec). These +devices need to be reset during a subsystem reset, allowing them to be +properly re-enabled afterwards. Add the S390 PCI host bridge to the list +of qdevs to be reset during subsystem reset. + +Signed-off-by: Matthew Rosato +Reviewed-by: Eric Farman +Acked-by: Halil Pasic +Acked-by: Christian Borntraeger +Cc: qemu-stable@nongnu.org +Message-Id: <1602767767-32713-1-git-send-email-mjrosato@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit db08244a3a7ec312dfed3fd9b88e114281215458) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 5905d2b7adc..5b3d07f55c4 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -103,6 +103,7 @@ static const char *const reset_dev_types[] = { + "s390-sclp-event-facility", + "s390-flic", + "diag288", ++ TYPE_S390_PCI_HOST_BRIDGE, + }; + + static void subsystem_reset(void) +-- +2.27.0 + diff --git a/kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch b/kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch new file mode 100755 index 0000000..5a38a88 --- /dev/null +++ b/kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch @@ -0,0 +1,90 @@ +From 8b06cba98e37b9c50e2a9deb1567d8cf4e1ba2b6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:05 -0500 +Subject: [PATCH 05/18] s390x/sclp.c: remove unneeded label in + sclp_service_call() + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-2-thuth@redhat.com> +Patchwork-id: 99497 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 01/12] s390x/sclp.c: remove unneeded label in sclp_service_call() +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Daniel Henrique Barboza + +'out' label can be 
replaced by 'return' with the appropriate +value. The 'r' integer, which is used solely to set the +return value for this label, can also be removed. + +CC: Cornelia Huck +CC: Halil Pasic +CC: Christian Borntraeger +Signed-off-by: Daniel Henrique Barboza +Reviewed-by: Thomas Huth +Message-Id: <20200106182425.20312-39-danielhb413@gmail.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit e6de76fca48012348d8c81b1399c861f444bd4a4) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 1c380a49cc7..d8ae207731f 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -241,24 +241,20 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + { + SCLPDevice *sclp = get_sclp_device(); + SCLPDeviceClass *sclp_c = SCLP_GET_CLASS(sclp); +- int r = 0; + SCCB work_sccb; + + hwaddr sccb_len = sizeof(SCCB); + + /* first some basic checks on program checks */ + if (env->psw.mask & PSW_MASK_PSTATE) { +- r = -PGM_PRIVILEGED; +- goto out; ++ return -PGM_PRIVILEGED; + } + if (cpu_physical_memory_is_io(sccb)) { +- r = -PGM_ADDRESSING; +- goto out; ++ return -PGM_ADDRESSING; + } + if ((sccb & ~0x1fffUL) == 0 || (sccb & ~0x1fffUL) == env->psa + || (sccb & ~0x7ffffff8UL) != 0) { +- r = -PGM_SPECIFICATION; +- goto out; ++ return -PGM_SPECIFICATION; + } + + /* +@@ -270,8 +266,7 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + + /* Valid sccb sizes */ + if (be16_to_cpu(work_sccb.h.length) < sizeof(SCCBHeader)) { +- r = -PGM_SPECIFICATION; +- goto out; ++ return -PGM_SPECIFICATION; + } + + if (!sclp_command_code_valid(code)) { +@@ -291,8 +286,7 @@ out_write: + + sclp_c->service_interrupt(sclp, sccb); + +-out: +- return r; ++ return 0; + } + + static void service_interrupt(SCLPDevice *sclp, uint32_t sccb) +-- +2.27.0 + diff --git 
a/kvm-s390x-sigp-Fix-sense-running-reporting.patch b/kvm-s390x-sigp-Fix-sense-running-reporting.patch new file mode 100755 index 0000000..7143964 --- /dev/null +++ b/kvm-s390x-sigp-Fix-sense-running-reporting.patch @@ -0,0 +1,49 @@ +From a2befb24c10f58ce6c27d242f3b88afee1f77ec8 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 7 Jul 2020 09:35:31 -0400 +Subject: [PATCH 2/4] s390x: sigp: Fix sense running reporting + +RH-Author: Thomas Huth +Message-id: <20200707093532.22456-2-thuth@redhat.com> +Patchwork-id: 97920 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/2] s390x: sigp: Fix sense running reporting +Bugzilla: 1854092 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +The logic was inverted and reported running if the cpu was stopped. +Let's fix that. + +Signed-off-by: Janosch Frank +Fixes: d1b468bc8869 ("s390x/tcg: implement SIGP SENSE RUNNING STATUS") +Reviewed-by: David Hildenbrand +Message-Id: <20200124134818.9981-1-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 4103500e2fa934a6995e4cedab37423e606715bf) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/s390x/sigp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/sigp.c b/target/s390x/sigp.c +index 727875bb4a..c604f17710 100644 +--- a/target/s390x/sigp.c ++++ b/target/s390x/sigp.c +@@ -348,9 +348,9 @@ static void sigp_sense_running(S390CPU *dst_cpu, SigpInfo *si) + + /* If halted (which includes also STOPPED), it is not running */ + if (CPU(dst_cpu)->halted) { +- si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; +- } else { + set_sigp_status(si, SIGP_STAT_NOT_RUNNING); ++ } else { ++ si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; + } + } + +-- +2.27.0 + diff --git a/kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch b/kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch new file mode 100755 index 0000000..b6ac314 --- /dev/null +++ b/kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch @@ -0,0 +1,57 @@ +From 0c85e86077b42547034ec6e8330a3e61d79b97ee Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 7 Jul 2020 09:35:32 -0400 +Subject: [PATCH 3/4] s390x/tcg: clear local interrupts on reset normal + +RH-Author: Thomas Huth +Message-id: <20200707093532.22456-3-thuth@redhat.com> +Patchwork-id: 97919 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/2] s390x/tcg: clear local interrupts on reset normal +Bugzilla: 1854092 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Cornelia Huck + +We neglected to clean up pending interrupts and emergency signals; +fix that. + +Message-Id: <20191206135404.16051-1-cohuck@redhat.com> +Signed-off-by: Cornelia Huck +Reviewed-by: David Hildenbrand +(cherry picked from commit bcf88d56efec4ffc153bbe98d11b689a5ebe1a91) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/s390x/cpu.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index edf8391504..a48e655c4d 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -98,10 +98,6 @@ struct CPUS390XState { + + uint64_t cregs[16]; /* control registers */ + +- int pending_int; +- uint16_t external_call_addr; +- DECLARE_BITMAP(emergency_signals, S390_MAX_CPUS); +- + uint64_t ckc; + uint64_t cputm; + uint32_t todpr; +@@ -117,6 +113,10 @@ struct CPUS390XState { + struct {} start_normal_reset_fields; + uint8_t riccb[64]; /* runtime instrumentation control */ + ++ int pending_int; ++ uint16_t external_call_addr; ++ DECLARE_BITMAP(emergency_signals, S390_MAX_CPUS); ++ + /* Fields up to this point are cleared by a CPU reset */ + struct {} end_reset_fields; + +-- +2.27.0 + diff --git a/kvm-s390x.conf b/kvm-s390x.conf new file mode 100755 index 0000000..d82b818 --- /dev/null +++ b/kvm-s390x.conf @@ -0,0 +1,19 @@ +# User changes in this file are preserved across upgrades. +# +# Setting "modprobe kvm nested=1" only enables Nested Virtualization until +# the next reboot or module reload. Uncomment the option below to enable +# the feature permanently. +# +#options kvm nested=1 +# +# +# Setting "modprobe kvm hpage=1" only enables Huge Page Backing (1MB) +# support until the next reboot or module reload. Uncomment the option +# below to enable the feature permanently. +# +# Note: - Incompatible with "nested=1". Loading the module will fail. +# - Dirty page logging will be performed on a 1MB (not 4KB) basis, +# which can result in a lot of data having to be transferred during +# migration, and therefore taking very long to converge. 
+# +#options kvm hpage=1 diff --git a/kvm-scsi-make-io_timeout-configurable.patch b/kvm-scsi-make-io_timeout-configurable.patch new file mode 100755 index 0000000..a073728 --- /dev/null +++ b/kvm-scsi-make-io_timeout-configurable.patch @@ -0,0 +1,177 @@ +From 2903ab6d961b2165f3cbfb786cd3be59a407a6b4 Mon Sep 17 00:00:00 2001 +From: Hannes Reinecke +Date: Mon, 16 Nov 2020 19:31:13 +0100 +Subject: [PATCH] scsi: make io_timeout configurable + +RH-Author: Paolo Bonzini +RH-MergeRequest: 40: scsi: make io_timeout configurable +RH-Commit: [1/1] 147c4b7bac867c708c1905e98c4f9329ab4ef838 +RH-Bugzilla: 1994041 +RH-Acked-by: Jon Maloy +RH-Acked-by: Thomas Huth +RH-Acked-by: Laszlo Ersek + +The current code sets an infinite timeout on SG_IO requests, +causing the guest to stall if the host experiences a frame +loss. +This patch adds an 'io_timeout' parameter for SCSIDevice to +make the SG_IO timeout configurable, and also shortens the +default timeout to 30 seconds to avoid infinite stalls. + +Signed-off-by: Hannes Reinecke +Message-Id: <20201116183114.55703-3-hare@suse.de> +Signed-off-by: Paolo Bonzini +(cherry picked from commit c9b6609b69facad0cc5425d4fa7934c33d7f2e91) +--- + hw/scsi/scsi-disk.c | 6 ++++-- + hw/scsi/scsi-generic.c | 17 +++++++++++------ + include/hw/scsi/scsi.h | 4 +++- + 3 files changed, 18 insertions(+), 9 deletions(-) + +diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c +index e44c61eeb4..5cb5fd35bd 100644 +--- a/hw/scsi/scsi-disk.c ++++ b/hw/scsi/scsi-disk.c +@@ -2610,7 +2610,7 @@ static int get_device_type(SCSIDiskState *s) + cmd[4] = sizeof(buf); + + ret = scsi_SG_IO_FROM_DEV(s->qdev.conf.blk, cmd, sizeof(cmd), +- buf, sizeof(buf)); ++ buf, sizeof(buf), s->qdev.io_timeout); + if (ret < 0) { + return -1; + } +@@ -2771,7 +2771,7 @@ static BlockAIOCB *scsi_block_do_sgio(SCSIBlockReq *req, + /* The rest is as in scsi-generic.c. 
*/ + io_header->mx_sb_len = sizeof(r->req.sense); + io_header->sbp = r->req.sense; +- io_header->timeout = UINT_MAX; ++ io_header->timeout = s->qdev.io_timeout * 1000; + io_header->usr_ptr = r; + io_header->flags |= SG_FLAG_DIRECT_IO; + +@@ -3089,6 +3089,8 @@ static Property scsi_block_properties[] = { + DEFAULT_MAX_IO_SIZE), + DEFINE_PROP_INT32("scsi_version", SCSIDiskState, qdev.default_scsi_version, + -1), ++ DEFINE_PROP_UINT32("io_timeout", SCSIDiskState, qdev.io_timeout, ++ DEFAULT_IO_TIMEOUT), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c +index e7798ebcd0..3a383fe4d6 100644 +--- a/hw/scsi/scsi-generic.c ++++ b/hw/scsi/scsi-generic.c +@@ -114,6 +114,8 @@ static int execute_command(BlockBackend *blk, + SCSIGenericReq *r, int direction, + BlockCompletionFunc *complete) + { ++ SCSIDevice *s = r->req.dev; ++ + r->io_header.interface_id = 'S'; + r->io_header.dxfer_direction = direction; + r->io_header.dxferp = r->buf; +@@ -122,7 +124,7 @@ static int execute_command(BlockBackend *blk, + r->io_header.cmd_len = r->req.cmd.len; + r->io_header.mx_sb_len = sizeof(r->req.sense); + r->io_header.sbp = r->req.sense; +- r->io_header.timeout = MAX_UINT; ++ r->io_header.timeout = s->io_timeout * 1000; + r->io_header.usr_ptr = r; + r->io_header.flags |= SG_FLAG_DIRECT_IO; + +@@ -503,7 +505,7 @@ static int read_naa_id(const uint8_t *p, uint64_t *p_wwn) + } + + int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size, +- uint8_t *buf, uint8_t buf_size) ++ uint8_t *buf, uint8_t buf_size, uint32_t timeout) + { + sg_io_hdr_t io_header; + uint8_t sensebuf[8]; +@@ -518,7 +520,7 @@ int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size, + io_header.cmd_len = cmd_size; + io_header.mx_sb_len = sizeof(sensebuf); + io_header.sbp = sensebuf; +- io_header.timeout = 6000; /* XXX */ ++ io_header.timeout = timeout * 1000; + + ret = blk_ioctl(blk, SG_IO, &io_header); + if (ret < 0 || io_header.driver_status 
|| io_header.host_status) { +@@ -548,7 +550,7 @@ static void scsi_generic_set_vpd_bl_emulation(SCSIDevice *s) + cmd[4] = sizeof(buf); + + ret = scsi_SG_IO_FROM_DEV(s->conf.blk, cmd, sizeof(cmd), +- buf, sizeof(buf)); ++ buf, sizeof(buf), s->io_timeout); + if (ret < 0) { + /* + * Do not assume anything if we can't retrieve the +@@ -584,7 +586,7 @@ static void scsi_generic_read_device_identification(SCSIDevice *s) + cmd[4] = sizeof(buf); + + ret = scsi_SG_IO_FROM_DEV(s->conf.blk, cmd, sizeof(cmd), +- buf, sizeof(buf)); ++ buf, sizeof(buf), s->io_timeout); + if (ret < 0) { + return; + } +@@ -635,7 +637,7 @@ static int get_stream_blocksize(BlockBackend *blk) + cmd[0] = MODE_SENSE; + cmd[4] = sizeof(buf); + +- ret = scsi_SG_IO_FROM_DEV(blk, cmd, sizeof(cmd), buf, sizeof(buf)); ++ ret = scsi_SG_IO_FROM_DEV(blk, cmd, sizeof(cmd), buf, sizeof(buf), 6); + if (ret < 0) { + return -1; + } +@@ -725,6 +727,7 @@ static void scsi_generic_realize(SCSIDevice *s, Error **errp) + + /* Only used by scsi-block, but initialize it nevertheless to be clean. 
*/ + s->default_scsi_version = -1; ++ s->io_timeout = DEFAULT_IO_TIMEOUT; + scsi_generic_read_device_inquiry(s); + } + +@@ -748,6 +751,8 @@ static SCSIRequest *scsi_new_request(SCSIDevice *d, uint32_t tag, uint32_t lun, + static Property scsi_generic_properties[] = { + DEFINE_PROP_DRIVE("drive", SCSIDevice, conf.blk), + DEFINE_PROP_BOOL("share-rw", SCSIDevice, conf.share_rw, false), ++ DEFINE_PROP_UINT32("io_timeout", SCSIDevice, io_timeout, ++ DEFAULT_IO_TIMEOUT), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h +index 332ef602f4..ae0724a32d 100644 +--- a/include/hw/scsi/scsi.h ++++ b/include/hw/scsi/scsi.h +@@ -17,6 +17,7 @@ typedef struct SCSIReqOps SCSIReqOps; + + #define SCSI_SENSE_BUF_SIZE_OLD 96 + #define SCSI_SENSE_BUF_SIZE 252 ++#define DEFAULT_IO_TIMEOUT 30 + + struct SCSIRequest { + SCSIBus *bus; +@@ -88,6 +89,7 @@ struct SCSIDevice + uint64_t port_wwn; + int scsi_version; + int default_scsi_version; ++ uint32_t io_timeout; + bool needs_vpd_bl_emulation; + bool hba_supports_iothread; + }; +@@ -192,7 +194,7 @@ void scsi_device_unit_attention_reported(SCSIDevice *dev); + void scsi_generic_read_device_inquiry(SCSIDevice *dev); + int scsi_device_get_sense(SCSIDevice *dev, uint8_t *buf, int len, bool fixed); + int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size, +- uint8_t *buf, uint8_t buf_size); ++ uint8_t *buf, uint8_t buf_size, uint32_t timeout); + SCSIDevice *scsi_device_find(SCSIBus *bus, int channel, int target, int lun); + + /* scsi-generic.c. 
*/ +-- +2.18.2 + diff --git a/kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch b/kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch new file mode 100755 index 0000000..189be7e --- /dev/null +++ b/kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch @@ -0,0 +1,79 @@ +From 08dc2a4dc481916fae9597220ad0faf3f6ed70c1 Mon Sep 17 00:00:00 2001 +From: Eduardo Otubo +Date: Mon, 16 Nov 2020 15:15:38 -0500 +Subject: [PATCH 1/5] seccomp: fix killing of whole process instead of thread +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eduardo Otubo +Message-id: <20201116151538.22254-1-otubo@redhat.com> +Patchwork-id: 99654 +O-Subject: [RHEL-8.3.0/RHEL-8.4.0 qemu-kvm PATCH] seccomp: fix killing of whole process instead of thread +Bugzilla: 1880546 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Danilo de Paula +RH-Acked-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1890885 +BRANCH: rhel-8.3.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=1890885 + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1880546 +BRANCH: rhel-8.4.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=33125023 + +From: Daniel P. Berrangé + +Back in 2018 we introduced support for killing the whole QEMU process +instead of just one thread, when a seccomp rule is violated: + + commit bda08a5764d470f101fa38635d30b41179a313e1 + Author: Marc-André Lureau + Date: Wed Aug 22 19:02:48 2018 +0200 + + seccomp: prefer SCMP_ACT_KILL_PROCESS if available + +Fast forward a year and we introduced a patch to avoid killing the +process for resource control syscalls tickled by Mesa. + + commit 9a1565a03b79d80b236bc7cc2dbce52a2ef3a1b8 + Author: Daniel P. 
Berrangé + Date: Wed Mar 13 09:49:03 2019 +0000 + + seccomp: don't kill process for resource control syscalls + +Unfortunately a logic bug effectively reverted the first commit +mentioned so that we go back to only killing the thread, not the whole +process. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Acked-by: Eduardo Otubo +(cherry picked from commit e474e3aacf4276eb0781d11c45e2fab996f9dc56) +Signed-off-by: Eduardo Otubo +Signed-off-by: Danilo C. L. de Paula +--- + qemu-seccomp.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/qemu-seccomp.c b/qemu-seccomp.c +index e0a1829b3dd..8325ecb766e 100644 +--- a/qemu-seccomp.c ++++ b/qemu-seccomp.c +@@ -136,8 +136,9 @@ static uint32_t qemu_seccomp_get_action(int set) + + if (qemu_seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0) { + kill_process = 1; ++ } else { ++ kill_process = 0; + } +- kill_process = 0; + } + if (kill_process == 1) { + return SCMP_ACT_KILL_PROCESS; +-- +2.27.0 + diff --git a/kvm-setup b/kvm-setup new file mode 100755 index 0000000..3bfedf6 --- /dev/null +++ b/kvm-setup @@ -0,0 +1,49 @@ +#! /bin/bash + +kvm_setup_powerpc () { + if grep '^platform[[:space:]]*:[[:space:]]*PowerNV' /proc/cpuinfo > /dev/null; then + # PowerNV platform, which is KVM HV capable + + if [ -z "$SUBCORES" ]; then + SUBCORES=1 + fi + + # Step 1. Load the KVM HV module + if ! modprobe -b kvm_hv; then + return + fi + + # On POWER8 a host core can only run threads of a single + # guest, meaning that SMT must be disabled on the host in + # order to run KVM guests. (Also applies to POWER7, but we + # don't support that). + # + # POWER9 doesn't have this limitation (though it will for hash + # guests on radix host when that's implemented). So, only set + # up subcores and disable SMT for POWER8. + if grep '^cpu[[:space:]]*:[[:space:]]*POWER8' /proc/cpuinfo > /dev/null; then + # Step 2. Configure subcore mode + /usr/sbin/ppc64_cpu --subcores-per-core=$SUBCORES + + # Step 3. 
Disable SMT (multithreading) + /usr/sbin/ppc64_cpu --smt=off + fi + fi +} + +kvm_setup_s390x () { + if grep -q "^features.*sie" /proc/cpuinfo; then + modprobe kvm + fi +} + +case $(uname -m) in + ppc64|ppc64le) + kvm_setup_powerpc + ;; + s390x) + kvm_setup_s390x + ;; +esac + +exit 0 diff --git a/kvm-setup.service b/kvm-setup.service new file mode 100755 index 0000000..9c4bf97 --- /dev/null +++ b/kvm-setup.service @@ -0,0 +1,14 @@ +[Unit] +Description=Perform system configuration to prepare system to run KVM guests +# Offlining CPUs can cause irqbalance to throw warnings if it's running +Before=irqbalance.service +# libvirtd reads CPU topology at startup, so change it before +Before=libvirtd.service + +[Service] +Type=oneshot +EnvironmentFile=-/etc/sysconfig/kvm +ExecStart=/usr/lib/systemd/kvm-setup + +[Install] +WantedBy=multi-user.target diff --git a/kvm-slirp-check-pkt_len-before-reading-protocol-header.patch b/kvm-slirp-check-pkt_len-before-reading-protocol-header.patch new file mode 100755 index 0000000..43c44ea --- /dev/null +++ b/kvm-slirp-check-pkt_len-before-reading-protocol-header.patch @@ -0,0 +1,72 @@ +From 2bfa25e55c0a49bc079e5769db2199989eda7745 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Fri, 11 Dec 2020 00:59:26 -0500 +Subject: [PATCH 03/14] slirp: check pkt_len before reading protocol header +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20201211005926.618830-2-jmaloy@redhat.com> +Patchwork-id: 100398 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] slirp: check pkt_len before reading protocol header +Bugzilla: 1902237 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Marc-André Lureau + +While processing ARP/NCSI packets in 'arp_input' or 'ncsi_input' +routines, ensure that pkt_len is large enough to accommodate the +respective protocol headers, lest it should do an OOB access. +Add check to avoid it. 
+ +CVE-2020-29129 CVE-2020-29130 + QEMU: slirp: out-of-bounds access while processing ARP/NCSI packets + -> https://www.openwall.com/lists/oss-security/2020/11/27/1 + +Reported-by: Qiuhao Li +Signed-off-by: Prasad J Pandit +Message-Id: <20201126135706.273950-1-ppandit@redhat.com> +Reviewed-by: Marc-André Lureau + +(cherry picked from libslirp commit 2e1dcbc0c2af64fcb17009eaf2ceedd81be2b27f) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/ncsi.c | 4 ++++ + slirp/src/slirp.c | 4 ++++ + 2 files changed, 8 insertions(+) + +diff --git a/slirp/src/ncsi.c b/slirp/src/ncsi.c +index 6864b735db4..251c0d2bfbb 100644 +--- a/slirp/src/ncsi.c ++++ b/slirp/src/ncsi.c +@@ -147,6 +147,10 @@ void ncsi_input(Slirp *slirp, const uint8_t *pkt, int pkt_len) + uint32_t checksum; + uint32_t *pchecksum; + ++ if (pkt_len < ETH_HLEN + sizeof(struct ncsi_pkt_hdr)) { ++ return; /* packet too short */ ++ } ++ + memset(ncsi_reply, 0, sizeof(ncsi_reply)); + + memset(reh->h_dest, 0xff, ETH_ALEN); +diff --git a/slirp/src/slirp.c b/slirp/src/slirp.c +index b0194cb32bb..86b0f52d923 100644 +--- a/slirp/src/slirp.c ++++ b/slirp/src/slirp.c +@@ -700,6 +700,10 @@ static void arp_input(Slirp *slirp, const uint8_t *pkt, int pkt_len) + return; + } + ++ if (pkt_len < ETH_HLEN + sizeof(struct slirp_arphdr)) { ++ return; /* packet too short */ ++ } ++ + ar_op = ntohs(ah->ar_op); + switch (ar_op) { + case ARPOP_REQUEST: +-- +2.27.0 + diff --git a/kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch b/kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch new file mode 100755 index 0000000..6d8dfe1 --- /dev/null +++ b/kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch @@ -0,0 +1,77 @@ +From 0f659af4870f151e25a7d2184b9a383bff58e3ba Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Fri, 17 Jan 2020 12:07:57 +0100 +Subject: [PATCH 2/4] slirp: use correct size while emulating IRC commands +MIME-Version: 1.0 +Content-Type: 
text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200117120758.1076549-3-marcandre.lureau@redhat.com> +Patchwork-id: 93400 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm + RHEL-AV-8.2.0 qemu-kvm PATCH 2/3] slirp: use correct size while emulating IRC commands +Bugzilla: 1791568 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi + +From: Prasad J Pandit + +While emulating IRC DCC commands, tcp_emu() uses 'mbuf' size +'m->m_size' to write DCC commands via snprintf(3). This may +lead to OOB write access, because 'bptr' points somewhere in +the middle of 'mbuf' buffer, not at the start. Use M_FREEROOM(m) +size to avoid OOB access. + +Reported-by: Vishnu Dev TJ +Signed-off-by: Prasad J Pandit +Reviewed-by: Samuel Thibault +Message-Id: <20200109094228.79764-2-ppandit@redhat.com> + +(cherry picked from libslirp commit ce131029d6d4a405cb7d3ac6716d03e58fb4a5d9) +Signed-off-by: Marc-André Lureau + +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tcp_subr.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index cbecd64..cedbfb2 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -778,7 +778,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, m->m_size, "DCC CHAT chat %lu %u%c\n", ++ m->m_len += snprintf(bptr, M_FREEROOM(m), ++ "DCC CHAT chat %lu %u%c\n", + (unsigned long)ntohl(so->so_faddr.s_addr), + ntohs(so->so_fport), 1); + } else if (sscanf(bptr, "DCC SEND %256s %u %u %u", buff, &laddr, &lport, +@@ -788,8 +789,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += +- snprintf(bptr, m->m_size, "DCC SEND %s %lu %u %u%c\n", buff, ++ m->m_len += snprintf(bptr, M_FREEROOM(m), ++ "DCC SEND %s %lu %u %u%c\n", buff, + 
(unsigned long)ntohl(so->so_faddr.s_addr), + ntohs(so->so_fport), n1, 1); + } else if (sscanf(bptr, "DCC MOVE %256s %u %u %u", buff, &laddr, &lport, +@@ -799,8 +800,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += +- snprintf(bptr, m->m_size, "DCC MOVE %s %lu %u %u%c\n", buff, ++ m->m_len += snprintf(bptr, M_FREEROOM(m), ++ "DCC MOVE %s %lu %u %u%c\n", buff, + (unsigned long)ntohl(so->so_faddr.s_addr), + ntohs(so->so_fport), n1, 1); + } +-- +1.8.3.1 + diff --git a/kvm-slirp-use-correct-size-while-emulating-commands.patch b/kvm-slirp-use-correct-size-while-emulating-commands.patch new file mode 100755 index 0000000..fe42f4f --- /dev/null +++ b/kvm-slirp-use-correct-size-while-emulating-commands.patch @@ -0,0 +1,71 @@ +From dfbfcf02738640ab83f7970e636b72b78f166675 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Fri, 17 Jan 2020 12:07:58 +0100 +Subject: [PATCH 3/4] slirp: use correct size while emulating commands +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200117120758.1076549-4-marcandre.lureau@redhat.com> +Patchwork-id: 93401 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm + RHEL-AV-8.2.0 qemu-kvm PATCH 3/3] slirp: use correct size while emulating commands +Bugzilla: 1791568 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi + +From: Prasad J Pandit + +While emulating services in tcp_emu(), it uses 'mbuf' size +'m->m_size' to write commands via snprintf(3). Use M_FREEROOM(m) +size to avoid possible OOB access. 
+ +Signed-off-by: Prasad J Pandit +Signed-off-by: Samuel Thibault +Message-Id: <20200109094228.79764-3-ppandit@redhat.com> + +(cherry picked from commit 82ebe9c370a0e2970fb5695aa19aa5214a6a1c80) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tcp_subr.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index cedbfb2..954d1a6 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -696,7 +696,7 @@ int tcp_emu(struct socket *so, struct mbuf *m) + n4 = (laddr & 0xff); + + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, m->m_size - m->m_len, ++ m->m_len += snprintf(bptr, M_FREEROOM(m), + "ORT %d,%d,%d,%d,%d,%d\r\n%s", n1, n2, n3, n4, + n5, n6, x == 7 ? buff : ""); + return 1; +@@ -731,8 +731,7 @@ int tcp_emu(struct socket *so, struct mbuf *m) + n4 = (laddr & 0xff); + + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += +- snprintf(bptr, m->m_size - m->m_len, ++ m->m_len += snprintf(bptr, M_FREEROOM(m), + "27 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n%s", + n1, n2, n3, n4, n5, n6, x == 7 ? 
buff : ""); + +@@ -758,8 +757,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + if (m->m_data[m->m_len - 1] == '\0' && lport != 0 && + (so = tcp_listen(slirp, INADDR_ANY, 0, so->so_laddr.s_addr, + htons(lport), SS_FACCEPTONCE)) != NULL) +- m->m_len = +- snprintf(m->m_data, m->m_size, "%d", ntohs(so->so_fport)) + 1; ++ m->m_len = snprintf(m->m_data, M_ROOM(m), ++ "%d", ntohs(so->so_fport)) + 1; + return 1; + + case EMU_IRC: +-- +1.8.3.1 + diff --git a/kvm-softmmu-memory-Log-invalid-memory-accesses.patch b/kvm-softmmu-memory-Log-invalid-memory-accesses.patch new file mode 100755 index 0000000..e4e1bc4 --- /dev/null +++ b/kvm-softmmu-memory-Log-invalid-memory-accesses.patch @@ -0,0 +1,84 @@ +From be0a190e3c5c4ff84f7c53630ed5a55644d18acc Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:06 -0400 +Subject: [PATCH 7/7] softmmu/memory: Log invalid memory accesses +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-7-jmaloy@redhat.com> +Patchwork-id: 101481 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 6/6] softmmu/memory: Log invalid memory accesses +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Philippe Mathieu-Daudé + +Log invalid memory accesses as GUEST_ERROR. + +This is particularly useful since commit 5d971f9e67 which reverted +("memory: accept mismatching sizes in memory_region_access_valid"). + +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Michael S. Tsirkin +Message-Id: <20201005152725.2143444-1-philmd@redhat.com> +Signed-off-by: Laurent Vivier + +(cherry picked from commit 21786c7e59847b1612406ff394958f22e5b323f8) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L.
de Paula +--- + memory.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/memory.c b/memory.c +index 0cfcb72a5a..660df8159a 100644 +--- a/memory.c ++++ b/memory.c +@@ -14,6 +14,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/log.h" + #include "qapi/error.h" + #include "cpu.h" + #include "exec/memory.h" +@@ -1353,10 +1354,18 @@ bool memory_region_access_valid(MemoryRegion *mr, + { + if (mr->ops->valid.accepts + && !mr->ops->valid.accepts(mr->opaque, addr, size, is_write, attrs)) { ++ qemu_log_mask(LOG_GUEST_ERROR, "Invalid access at addr " ++ "0x%" HWADDR_PRIX ", size %u, " ++ "region '%s', reason: rejected\n", ++ addr, size, memory_region_name(mr)); + return false; + } + + if (!mr->ops->valid.unaligned && (addr & (size - 1))) { ++ qemu_log_mask(LOG_GUEST_ERROR, "Invalid access at addr " ++ "0x%" HWADDR_PRIX ", size %u, " ++ "region '%s', reason: unaligned\n", ++ addr, size, memory_region_name(mr)); + return false; + } + +@@ -1367,6 +1376,13 @@ bool memory_region_access_valid(MemoryRegion *mr, + + if (size > mr->ops->valid.max_access_size + || size < mr->ops->valid.min_access_size) { ++ qemu_log_mask(LOG_GUEST_ERROR, "Invalid access at addr " ++ "0x%" HWADDR_PRIX ", size %u, " ++ "region '%s', reason: invalid size " ++ "(min:%u max:%u)\n", ++ addr, size, memory_region_name(mr), ++ mr->ops->valid.min_access_size, ++ mr->ops->valid.max_access_size); + return false; + } + return true; +-- +2.27.0 + diff --git a/kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch b/kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch new file mode 100755 index 0000000..7aaa982 --- /dev/null +++ b/kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch @@ -0,0 +1,205 @@ +From dfdf950e893c23e77c9dc0be18fca66ad195d260 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Wed, 10 Feb 2021 15:56:45 +0000 +Subject: [PATCH 2/2] spapr: Adjust firmware path of PCI devices +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + 
+RH-Author: Greg Kurz +Message-id: <20210210165645.470195-2-gkurz@redhat.com> +Patchwork-id: 101038 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] spapr: Adjust firmware path of PCI devices +Bugzilla: 1912891 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: David Gibson +RH-Acked-by: Laszlo Ersek + +From: Greg Kurz + +It is currently not possible to perform a strict boot from USB storage: + +$ qemu-system-ppc64 -accel kvm -nodefaults -nographic -serial stdio \ + -boot strict=on \ + -device qemu-xhci \ + -device usb-storage,drive=disk,bootindex=0 \ + -blockdev driver=file,node-name=disk,filename=fedora-ppc64le.qcow2 + +SLOF ********************************************************************** +QEMU Starting + Build Date = Jul 17 2020 11:15:24 + FW Version = git-e18ddad8516ff2cf + Press "s" to enter Open Firmware. + +Populating /vdevice methods +Populating /vdevice/vty@71000000 +Populating /vdevice/nvram@71000001 +Populating /pci@800000020000000 + 00 0000 (D) : 1b36 000d serial bus [ usb-xhci ] +No NVRAM common partition, re-initializing... +Scanning USB + XHCI: Initializing + USB Storage + SCSI: Looking for devices + 101000000000000 DISK : "QEMU QEMU HARDDISK 2.5+" +Using default console: /vdevice/vty@71000000 + + Welcome to Open Firmware + + Copyright (c) 2004, 2017 IBM Corporation All rights reserved. + This program and the accompanying materials are made available + under the terms of the BSD License available at + http://www.opensource.org/licenses/bsd-license.php + +Trying to load: from: /pci@800000020000000/usb@0/storage@1/disk@101000000000000 ... +E3405: No such device + +E3407: Load failed + + Type 'boot' and press return to continue booting the system. + Type 'reset-all' and press return to reboot the system. + +Ready! +0 > + +The device tree handed over by QEMU to SLOF indeed contains: + +qemu,boot-list = + "/pci@800000020000000/usb@0/storage@1/disk@101000000000000 HALT"; + +but the device node is named usb-xhci@0, not usb@0. 
+ +This happens because the firmware names of PCI devices returned +by get_boot_devices_list() come from pcibus_get_fw_dev_path(), +while the sPAPR PHB code uses a different naming scheme for +device nodes. This inconsistency has always been there but it was +hidden for a long time because SLOF used to rename USB device +nodes, until this commit, merged in QEMU 4.2.0 : + +commit 85164ad4ed9960cac842fa4cc067c6b6699b0994 +Author: Alexey Kardashevskiy +Date: Wed Sep 11 16:24:32 2019 +1000 + + pseries: Update SLOF firmware image + + This fixes USB host bus adapter name in the device tree to match QEMU's + one. + + Signed-off-by: Alexey Kardashevskiy + Signed-off-by: David Gibson + +Fortunately, sPAPR implements the firmware path provider interface. +This provides a way to override the default firmware paths. + +Just factor out the sPAPR PHB naming logic from spapr_dt_pci_device() +to a helper, and use it in the sPAPR firmware path provider hook. + +Fixes: 85164ad4ed99 ("pseries: Update SLOF firmware image") +Signed-off-by: Greg Kurz +Message-Id: <20210122170157.246374-1-groug@kaod.org> +Reviewed-by: Daniel Henrique Barboza +Signed-off-by: David Gibson +(cherry picked from commit 040bdafce12f750816d879442014df2999a995c4) +Signed-off-by: Greg Kurz +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/ppc/spapr.c | 5 +++++ + hw/ppc/spapr_pci.c | 33 ++++++++++++++++++--------------- + include/hw/pci-host/spapr.h | 2 ++ + 3 files changed, 25 insertions(+), 15 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index 00b1ef075e..bee2299199 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -3013,6 +3013,7 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus, + SCSIDevice *d = CAST(SCSIDevice, dev, TYPE_SCSI_DEVICE); + SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE); + VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON); ++ PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE); + + if (d) { + void *spapr = CAST(void, bus->parent, "spapr-vscsi"); +@@ -3086,6 +3087,10 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus, + return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn)); + } + ++ if (pcidev) { ++ return spapr_pci_fw_dev_name(pcidev); ++ } ++ + return NULL; + } + +diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c +index f6fbcf99ed..befa570aa8 100644 +--- a/hw/ppc/spapr_pci.c ++++ b/hw/ppc/spapr_pci.c +@@ -1348,15 +1348,29 @@ static int spapr_dt_pci_bus(SpaprPhbState *sphb, PCIBus *bus, + return offset; + } + ++char *spapr_pci_fw_dev_name(PCIDevice *dev) ++{ ++ const gchar *basename; ++ int slot = PCI_SLOT(dev->devfn); ++ int func = PCI_FUNC(dev->devfn); ++ uint32_t ccode = pci_default_read_config(dev, PCI_CLASS_PROG, 3); ++ ++ basename = dt_name_from_class((ccode >> 16) & 0xff, (ccode >> 8) & 0xff, ++ ccode & 0xff); ++ ++ if (func != 0) { ++ return g_strdup_printf("%s@%x,%x", basename, slot, func); ++ } else { ++ return g_strdup_printf("%s@%x", basename, slot); ++ } ++} ++ + /* create OF node for pci device and required OF DT properties */ + static int spapr_dt_pci_device(SpaprPhbState *sphb, PCIDevice *dev, + void *fdt, int parent_offset) + { + int offset; +- const gchar *basename; +- gchar *nodename; +- int slot = PCI_SLOT(dev->devfn); 
+- int func = PCI_FUNC(dev->devfn); ++ g_autofree gchar *nodename = spapr_pci_fw_dev_name(dev); + PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev); + ResourceProps rp; + SpaprDrc *drc = drc_from_dev(sphb, dev); +@@ -1373,19 +1387,8 @@ static int spapr_dt_pci_device(SpaprPhbState *sphb, PCIDevice *dev, + uint32_t pci_status = pci_default_read_config(dev, PCI_STATUS, 2); + gchar *loc_code; + +- basename = dt_name_from_class((ccode >> 16) & 0xff, (ccode >> 8) & 0xff, +- ccode & 0xff); +- +- if (func != 0) { +- nodename = g_strdup_printf("%s@%x,%x", basename, slot, func); +- } else { +- nodename = g_strdup_printf("%s@%x", basename, slot); +- } +- + _FDT(offset = fdt_add_subnode(fdt, parent_offset, nodename)); + +- g_free(nodename); +- + /* in accordance with PAPR+ v2.7 13.6.3, Table 181 */ + _FDT(fdt_setprop_cell(fdt, offset, "vendor-id", vendor_id)); + _FDT(fdt_setprop_cell(fdt, offset, "device-id", device_id)); +diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h +index 8877ff51fb..9522db9047 100644 +--- a/include/hw/pci-host/spapr.h ++++ b/include/hw/pci-host/spapr.h +@@ -212,4 +212,6 @@ static inline unsigned spapr_phb_windows_supported(SpaprPhbState *sphb) + return sphb->ddw_enabled ? 
SPAPR_PCI_DMA_MAX_WINDOWS : 1; + } + ++char *spapr_pci_fw_dev_name(PCIDevice *dev); ++ + #endif /* PCI_HOST_SPAPR_H */ +-- +2.27.0 + diff --git a/kvm-spapr-Allow-memory-unplug-to-always-succeed.patch b/kvm-spapr-Allow-memory-unplug-to-always-succeed.patch new file mode 100755 index 0000000..2968267 --- /dev/null +++ b/kvm-spapr-Allow-memory-unplug-to-always-succeed.patch @@ -0,0 +1,101 @@ +From 1fc9b693c54c93736c6f902f3df8b94440e8cc5d Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:53 -0500 +Subject: [PATCH 5/9] spapr: Allow memory unplug to always succeed + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-6-gkurz@redhat.com> +Patchwork-id: 100686 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 5/6] spapr: Allow memory unplug to always succeed +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: Greg Kurz + +It is currently impossible to hot-unplug a memory device between +machine reset and CAS. + +(qemu) device_del dimm1 +Error: Memory hot unplug not supported for this guest + +This limitation was introduced in order to provide an explicit +error path for older guests that didn't support hot-plug event +sources (and thus memory hot-unplug). + +The Linux kernel has been supporting these since 4.11. All recent +enough guests are thus capable of handling the removal of a memory +device at all times, including during early boot. + +Lift the limitation for the latest machine type. This means that +trying to unplug memory from a guest that doesn't support it will +likely just do nothing and the memory will only get removed at +next reboot. Such older guests can still get the existing behavior +by using an older machine type.
+ +Signed-off-by: Greg Kurz +Message-Id: <160794035064.23292.17560963281911312439.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 1e8b5b1aa16b7d73ba8ba52c95d0b52329d5c9d0) +Signed-off-by: Greg Kurz + +Conflicts: + hw/ppc/spapr.c + include/hw/ppc/spapr.h + +Conflicts around the addition of pre_6_0_memory_unplug. Ignore the +change that sets pre_6_0_memory_unplug for older machine types. +This is ok because pre_6_0_memory_unplug is removed in a subsequent +patch anyway. + +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr.c | 3 ++- + hw/ppc/spapr_events.c | 3 ++- + include/hw/ppc/spapr.h | 1 + + 3 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index 992bd08aaa..f8de33e3e5 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -4001,7 +4001,8 @@ static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev, + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { +- if (spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) { ++ if (!smc->pre_6_0_memory_unplug || ++ spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) { + spapr_memory_unplug_request(hotplug_dev, dev, errp); + } else { + /* NOTE: this means there is a window after guest reset, prior to +diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c +index 15b92b63ad..6e284aa4bc 100644 +--- a/hw/ppc/spapr_events.c ++++ b/hw/ppc/spapr_events.c +@@ -547,7 +547,8 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action, + /* we should not be using count_indexed value unless the guest + * supports dedicated hotplug event source + */ +- g_assert(spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT)); ++ g_assert(!SPAPR_MACHINE_GET_CLASS(spapr)->pre_6_0_memory_unplug || ++ spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT)); + hp->drc_id.count_indexed.count = + cpu_to_be32(drc_id->count_indexed.count); + hp->drc_id.count_indexed.index = +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h 
+index e5e2a99046..ac6961ed16 100644 +--- a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -124,6 +124,7 @@ struct SpaprMachineClass { + bool pre_4_1_migration; /* don't migrate hpt-max-page-size */ + bool linux_pci_probe; + bool smp_threads_vsmt; /* set VSMT to smp_threads by default */ ++ bool pre_6_0_memory_unplug; + + bool has_power9_support; + void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, +-- +2.18.2 + diff --git a/kvm-spapr-Don-t-trigger-a-CAS-reboot-for-XICS-XIVE-mode-.patch b/kvm-spapr-Don-t-trigger-a-CAS-reboot-for-XICS-XIVE-mode-.patch new file mode 100755 index 0000000..d934712 --- /dev/null +++ b/kvm-spapr-Don-t-trigger-a-CAS-reboot-for-XICS-XIVE-mode-.patch @@ -0,0 +1,113 @@ +From f2aeed761d2dad14920fa08c977dc45564886d9b Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Fri, 3 Jan 2020 01:15:12 +0000 +Subject: [PATCH 1/5] spapr: Don't trigger a CAS reboot for XICS/XIVE mode + changeover +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: David Gibson +Message-id: <20200103011512.49129-2-dgibson@redhat.com> +Patchwork-id: 93261 +O-Subject: [RHEL-AV-4.2 qemu-kvm PATCH 1/1] spapr: Don't trigger a CAS reboot for XICS/XIVE mode changeover +Bugzilla: 1733893 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: David Gibson + +PAPR allows the interrupt controller used on a POWER9 machine (XICS or +XIVE) to be selected by the guest operating system, by using the +ibm,client-architecture-support (CAS) feature negotiation call. + +Currently, if the guest selects an interrupt controller different from the +one selected at initial boot, this causes the system to be reset with the +new model and the boot starts again. This means we run through the SLOF +boot process twice, as well as any other bootloader (e.g. grub) in use +before the OS calls CAS. This can be confusing and/or inconvenient for +users. 
+ +Thanks to two fairly recent changes, we no longer need this reboot. 1) we +now completely regenerate the device tree when CAS is called (meaning we +don't need special case updates for all the device tree changes caused by +the interrupt controller mode change), 2) we now have explicit code paths +to activate and deactivate the different interrupt controllers, rather than +just implicitly calling those at machine reset time. + +We can therefore eliminate the reboot for changing irq mode, simply by +putting a call to spapr_irq_update_active_intc() before we call +spapr_h_cas_compose_response() (which gives the updated device tree to +the guest firmware and OS). + +Signed-off-by: David Gibson +Reviewed-by: Cedric Le Goater +Reviewed-by: Greg Kurz +(cherry picked from commit 8deb8019d696c75e6ecaee7545026b62aba2f1bb) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1733893 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_hcall.c | 33 +++++++++++++-------------------- + 1 file changed, 13 insertions(+), 20 deletions(-) + +diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c +index 140f05c..05a7ca2 100644 +--- a/hw/ppc/spapr_hcall.c ++++ b/hw/ppc/spapr_hcall.c +@@ -1767,21 +1767,10 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + } + spapr->cas_pre_isa3_guest = !spapr_ovec_test(ov1_guest, OV1_PPC_3_00); + spapr_ovec_cleanup(ov1_guest); +- if (!spapr->cas_reboot) { +- /* If spapr_machine_reset() did not set up a HPT but one is necessary +- * (because the guest isn't going to use radix) then set it up here. */ +- if ((spapr->patb_entry & PATE1_GR) && !guest_radix) { +- /* legacy hash or new hash: */ +- spapr_setup_hpt_and_vrma(spapr); +- } +- spapr->cas_reboot = +- (spapr_h_cas_compose_response(spapr, args[1], args[2], +- ov5_updates) != 0); +- } + + /* +- * Ensure the guest asks for an interrupt mode we support; otherwise +- * terminate the boot. 
++ * Ensure the guest asks for an interrupt mode we support; ++ * otherwise terminate the boot. + */ + if (guest_xive) { + if (!spapr->irq->xive) { +@@ -1797,14 +1786,18 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + } + } + +- /* +- * Generate a machine reset when we have an update of the +- * interrupt mode. Only required when the machine supports both +- * modes. +- */ ++ spapr_irq_update_active_intc(spapr); ++ + if (!spapr->cas_reboot) { +- spapr->cas_reboot = spapr_ovec_test(ov5_updates, OV5_XIVE_EXPLOIT) +- && spapr->irq->xics && spapr->irq->xive; ++ /* If spapr_machine_reset() did not set up a HPT but one is necessary ++ * (because the guest isn't going to use radix) then set it up here. */ ++ if ((spapr->patb_entry & PATE1_GR) && !guest_radix) { ++ /* legacy hash or new hash: */ ++ spapr_setup_hpt_and_vrma(spapr); ++ } ++ spapr->cas_reboot = ++ (spapr_h_cas_compose_response(spapr, args[1], args[2], ++ ov5_updates) != 0); + } + + spapr_ovec_cleanup(ov5_updates); +-- +1.8.3.1 + diff --git a/kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch b/kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch new file mode 100755 index 0000000..1462d52 --- /dev/null +++ b/kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch @@ -0,0 +1,145 @@ +From ad7aaf34400b1bbd41bbec182fd5895eaad50932 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:51 -0500 +Subject: [PATCH 3/9] spapr: Don't use spapr_drc_needed() in CAS code + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-4-gkurz@redhat.com> +Patchwork-id: 100683 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 3/6] spapr: Don't use spapr_drc_needed() in CAS code +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: Greg Kurz + +We currently don't support hotplug of devices between boot and CAS. If +this happens a CAS reboot is triggered. 
We detect this during CAS using +the spapr_drc_needed() function which is essentially a VMStateDescription +.needed callback. Even if the condition for CAS reboot happens to be the +same as for DRC migration, it looks wrong to piggyback a migration helper +for this. + +Introduce a helper with slightly more explicit name and use it in both CAS +and DRC migration code. Since a subsequent patch will enhance this helper +to cover the case of hot unplug, let's go for spapr_drc_transient(). While +here convert spapr_hotplugged_dev_before_cas() to the "transient" wording as +well. + +This doesn't change any behaviour. + +Signed-off-by: Greg Kurz +Message-Id: <158169248180.3465937.9531405453362718771.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 4b63db1289a9e597bc151fa5e4d72f882cb6de1e) +Signed-off-by: Greg Kurz +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr_drc.c | 20 ++++++++++++++------ + hw/ppc/spapr_hcall.c | 14 +++++++++----- + include/hw/ppc/spapr_drc.h | 4 +++- + 3 files changed, 26 insertions(+), 12 deletions(-) + +diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c +index 62f1a42592..9b498d429e 100644 +--- a/hw/ppc/spapr_drc.c ++++ b/hw/ppc/spapr_drc.c +@@ -455,23 +455,31 @@ void spapr_drc_reset(SpaprDrc *drc) + } + } + +-bool spapr_drc_needed(void *opaque) ++bool spapr_drc_transient(SpaprDrc *drc) + { +- SpaprDrc *drc = (SpaprDrc *)opaque; + SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); + +- /* If no dev is plugged in there is no need to migrate the DRC state */ ++ /* ++ * If no dev is plugged in there is no need to migrate the DRC state ++ * nor to reset the DRC at CAS. 
++ */ + if (!drc->dev) { + return false; + } + + /* +- * We need to migrate the state if it's not equal to the expected +- * long-term state, which is the same as the coldplugged initial +- * state */ ++ * We need to reset the DRC at CAS or to migrate the DRC state if it's ++ * not equal to the expected long-term state, which is the same as the ++ * coldplugged initial state. ++ */ + return (drc->state != drck->ready_state); + } + ++static bool spapr_drc_needed(void *opaque) ++{ ++ return spapr_drc_transient(opaque); ++} ++ + static const VMStateDescription vmstate_spapr_drc = { + .name = "spapr_drc", + .version_id = 1, +diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c +index 0f19be794c..d70e643752 100644 +--- a/hw/ppc/spapr_hcall.c ++++ b/hw/ppc/spapr_hcall.c +@@ -1640,20 +1640,24 @@ static uint32_t cas_check_pvr(SpaprMachineState *spapr, PowerPCCPU *cpu, + return best_compat; + } + +-static bool spapr_hotplugged_dev_before_cas(void) ++static bool spapr_transient_dev_before_cas(void) + { +- Object *drc_container, *obj; ++ Object *drc_container; + ObjectProperty *prop; + ObjectPropertyIterator iter; + + drc_container = container_get(object_get_root(), "/dr-connector"); + object_property_iter_init(&iter, drc_container); + while ((prop = object_property_iter_next(&iter))) { ++ SpaprDrc *drc; ++ + if (!strstart(prop->type, "link<", NULL)) { + continue; + } +- obj = object_property_get_link(drc_container, prop->name, NULL); +- if (spapr_drc_needed(obj)) { ++ drc = SPAPR_DR_CONNECTOR(object_property_get_link(drc_container, ++ prop->name, NULL)); ++ ++ if (spapr_drc_transient(drc)) { + return true; + } + } +@@ -1812,7 +1816,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + + spapr_irq_update_active_intc(spapr); + +- if (spapr_hotplugged_dev_before_cas()) { ++ if (spapr_transient_dev_before_cas()) { + spapr->cas_reboot = true; + } + +diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h +index 83f03cc577..7e09d57114 100644 
+--- a/include/hw/ppc/spapr_drc.h ++++ b/include/hw/ppc/spapr_drc.h +@@ -269,7 +269,9 @@ int spapr_dt_drc(void *fdt, int offset, Object *owner, uint32_t drc_type_mask); + + void spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp); + void spapr_drc_detach(SpaprDrc *drc); +-bool spapr_drc_needed(void *opaque); ++ ++/* Returns true if a hot plug/unplug request is pending */ ++bool spapr_drc_transient(SpaprDrc *drc); + + static inline bool spapr_drc_unplug_requested(SpaprDrc *drc) + { +-- +2.18.2 + diff --git a/kvm-spapr-Enable-DD2.3-accelerated-count-cache-flush-in-.patch b/kvm-spapr-Enable-DD2.3-accelerated-count-cache-flush-in-.patch new file mode 100755 index 0000000..0aa782b --- /dev/null +++ b/kvm-spapr-Enable-DD2.3-accelerated-count-cache-flush-in-.patch @@ -0,0 +1,135 @@ +From eb121ffa97c1c25d7853d51b4c8209c0bb521deb Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Fri, 7 Feb 2020 00:57:04 +0000 +Subject: [PATCH 1/7] spapr: Enable DD2.3 accelerated count cache flush in + pseries-5.0 machine + +RH-Author: David Gibson +Message-id: <20200207005704.194428-1-dgibson@redhat.com> +Patchwork-id: 93737 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCHv2] spapr: Enable DD2.3 accelerated count cache flush in pseries-5.0 machine +Bugzilla: 1796240 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: David Gibson + +For POWER9 DD2.2 cpus, the best current Spectre v2 indirect branch +mitigation is "count cache disabled", which is configured with: + -machine cap-ibs=fixed-ccd +However, this option isn't available on DD2.3 CPUs with KVM, because they +don't have the count cache disabled. + +For POWER9 DD2.3 cpus, it is "count cache flush with assist", configured +with: + -machine cap-ibs=workaround,cap-ccf-assist=on +However this option isn't available on DD2.2 CPUs with KVM, because they +don't have the special CCF assist instruction this relies on. 
+ +On current machine types, we default to "count cache flush w/o assist", +that is: + -machine cap-ibs=workaround,cap-ccf-assist=off +This runs, with mitigation on both DD2.2 and DD2.3 host cpus, but has a +fairly significant performance impact. + +It turns out we can do better. The special instruction that CCF assist +uses to trigger a count cache flush is a no-op on earlier CPUs, rather than +trapping or causing other badness. It doesn't, of itself, implement the +mitigation, but *if* we have count-cache-disabled, then the count cache +flush is unnecessary, and so using the count cache flush mitigation is +harmless. + +Therefore for the new pseries-5.0 machine type, enable cap-ccf-assist by +default. Along with that, suppress throwing an error if cap-ccf-assist +is selected but KVM doesn't support it, as long as KVM *is* giving us +count-cache-disabled. To allow TCG to work out of the box, even though it +doesn't implement the ccf flush assist, downgrade the error in that case to +a warning. This matches several Spectre mitigations where we allow TCG +to operate for debugging, since we don't really make guarantees about TCG +security properties anyway. + +While we're there, make the TCG warning for this case match that for other +mitigations. + +Signed-off-by: David Gibson +Tested-by: Michael Ellerman +(cherry picked from commit 37965dfe4dffa3ac49438337417608e7f346b58a) +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + hw/ppc/spapr.c + +Adjusted machine version compatibility code to the RHEL machine types +rather than the upstream machine types. + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1796240 +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=26285002 +Branch: rhel-av-8.2.0 +Upstream: Merged for qemu-5.0 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/ppc/spapr.c | 4 +++- + hw/ppc/spapr_caps.c | 21 +++++++++++++++++---- + 2 files changed, 20 insertions(+), 5 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index c12862d..a330f03 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -4440,7 +4440,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) + smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */ + smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF; + smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON; +- smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF; ++ smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON; + spapr_caps_add_properties(smc, &error_abort); + smc->irq = &spapr_irq_dual; + smc->dr_phb_enabled = true; +@@ -4904,6 +4904,8 @@ static void spapr_machine_rhel810_class_options(MachineClass *mc) + hw_compat_rhel_8_1_len); + compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); + ++ /* from pseries-4.2 */ ++ smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF; + } + + DEFINE_SPAPR_MACHINE(rhel810, "rhel8.1.0", false); +diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c +index 805f385..6e6fb28 100644 +--- a/hw/ppc/spapr_caps.c ++++ b/hw/ppc/spapr_caps.c +@@ -492,11 +492,24 @@ static void cap_ccf_assist_apply(SpaprMachineState *spapr, uint8_t val, + uint8_t kvm_val = kvmppc_get_cap_count_cache_flush_assist(); + + if (tcg_enabled() && val) { +- /* TODO - for now only allow broken for TCG */ +- error_setg(errp, +-"Requested count cache flush assist capability level not supported by tcg," +- " try appending -machine cap-ccf-assist=off"); ++ /* TCG doesn't implement anything here, but allow with a warning */ ++ warn_report("TCG doesn't support requested feature, cap-ccf-assist=on"); + } else if (kvm_enabled() && (val > kvm_val)) { ++ uint8_t kvm_ibs = kvmppc_get_cap_safe_indirect_branch(); ++ ++ if (kvm_ibs == SPAPR_CAP_FIXED_CCD) { ++ /* ++ * If we don't have CCF 
assist on the host, the assist ++ * instruction is a harmless no-op. It won't correctly ++ * implement the cache count flush *but* if we have ++ * count-cache-disabled in the host, that flush is ++ * unnnecessary. So, specifically allow this case. This ++ * allows us to have better performance on POWER9 DD2.3, ++ * while still working on POWER9 DD2.2 and POWER8 host ++ * cpus. ++ */ ++ return; ++ } + error_setg(errp, + "Requested count cache flush assist capability level not supported by kvm," + " try appending -machine cap-ccf-assist=off"); +-- +1.8.3.1 + diff --git a/kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch b/kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch new file mode 100755 index 0000000..8d30406 --- /dev/null +++ b/kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch @@ -0,0 +1,165 @@ +From f9d332b1280cd3f6009b59323719548a36a7c52b Mon Sep 17 00:00:00 2001 +From: Daniel Henrique Barboza +Date: Mon, 21 Jun 2021 14:40:24 -0400 +Subject: [PATCH 2/4] spapr: Fix EEH capability issue on KVM guest for PCI + passthru + +RH-Author: Daniel Henrique Barboza +Message-id: <20210621144024.199732-2-dbarboza@redhat.com> +Patchwork-id: 101740 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] spapr: Fix EEH capability issue on KVM guest for PCI passthru +Bugzilla: 1957866 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Greg Kurz +RH-Acked-by: David Gibson + +From: Mahesh Salgaonkar + +With upstream kernel, especially after commit 98ba956f6a389 +("powerpc/pseries/eeh: Rework device EEH PE determination") we see that KVM +guest isn't able to enable EEH option for PCI pass-through devices anymore. + +[root@atest-guest ~]# dmesg | grep EEH +[ 0.032337] EEH: pSeries platform initialized +[ 0.298207] EEH: No capable adapters found: recovery disabled. +[root@atest-guest ~]# + +So far the linux kernel was assuming pe_config_addr equal to device's +config_addr and using it to enable EEH on the PE through ibm,set-eeh-option +RTAS call. 
Which wasn't the correct way as per PAPR. The linux kernel +commit 98ba956f6a389 fixed this flow. With that fixed, linux now uses PE +config address returned by ibm,get-config-addr-info2 RTAS call to enable +EEH option per-PE basis instead of per-device basis. However this has +uncovered a bug in qemu where ibm,set-eeh-option is treating PE config +address as per-device config address. + +Hence in qemu guest with recent kernel the ibm,set-eeh-option RTAS call +fails with -3 return value indicating that there is no PCI device exist for +the specified PE config address. The rtas_ibm_set_eeh_option call uses +pci_find_device() to get the PC device that matches specific bus and devfn +extracted from PE config address passed as argument. Thus it tries to map +the PE config address to a single specific PCI device 'bus->devices[devfn]' +which always results into checking device on slot 0 'bus->devices[0]'. +This succeeds when there is a pass-through device (vfio-pci) present on +slot 0. But in cases where there is no pass-through device present in slot +0, but present in non-zero slots, ibm,set-eeh-option call fails to enable +the EEH capability. + +hw/ppc/spapr_pci_vfio.c: spapr_phb_vfio_eeh_set_option() + case RTAS_EEH_ENABLE: { + PCIHostState *phb; + PCIDevice *pdev; + + /* + * The EEH functionality is enabled on basis of PCI device, + * instead of PE. We need check the validity of the PCI + * device address. + */ + phb = PCI_HOST_BRIDGE(sphb); + pdev = pci_find_device(phb->bus, + (addr >> 16) & 0xFF, (addr >> 8) & 0xFF); + if (!pdev || !object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { + return RTAS_OUT_PARAM_ERROR; + } + +hw/pci/pci.c:pci_find_device() + +PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn) +{ + bus = pci_find_bus_nr(bus, bus_num); + + if (!bus) + return NULL; + + return bus->devices[devfn]; +} + +This patch fixes ibm,set-eeh-option to check for presence of any PCI device +(vfio-pci) under specified bus and enable the EEH if found. 
The current +code already makes sure that all the devices on that bus are from same +iommu group (within same PE) and fail very early if it does not. + +After this fix guest is able to find EEH capable devices and enable EEH +recovery on it. + +[root@atest-guest ~]# dmesg | grep EEH +[ 0.048139] EEH: pSeries platform initialized +[ 0.405115] EEH: Capable adapter found: recovery enabled. +[root@atest-guest ~]# + +Reviewed-by: Daniel Henrique Barboza +Signed-off-by: Mahesh Salgaonkar +Message-Id: <162158429107.145117.5843504911924013125.stgit@jupiter> +Signed-off-by: David Gibson +(cherry picked from commit ac9ef668321ebb6eb871a0c4dd380fa7d7891b4e) +Signed-off-by: Daniel Henrique Barboza +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_pci_vfio.c | 40 +++++++++++++++++++++++++++++++++------- + 1 file changed, 33 insertions(+), 7 deletions(-) + +diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c +index ecb34aaade..a411b08d60 100644 +--- a/hw/ppc/spapr_pci_vfio.c ++++ b/hw/ppc/spapr_pci_vfio.c +@@ -48,6 +48,16 @@ void spapr_phb_vfio_reset(DeviceState *qdev) + spapr_phb_vfio_eeh_reenable(SPAPR_PCI_HOST_BRIDGE(qdev)); + } + ++static void spapr_eeh_pci_find_device(PCIBus *bus, PCIDevice *pdev, ++ void *opaque) ++{ ++ bool *found = opaque; ++ ++ if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { ++ *found = true; ++ } ++} ++ + int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb, + unsigned int addr, int option) + { +@@ -60,17 +70,33 @@ int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb, + break; + case RTAS_EEH_ENABLE: { + PCIHostState *phb; +- PCIDevice *pdev; ++ bool found = false; + + /* +- * The EEH functionality is enabled on basis of PCI device, +- * instead of PE. We need check the validity of the PCI +- * device address. ++ * The EEH functionality is enabled per sphb level instead of ++ * per PCI device. We have already identified this specific sphb ++ * based on buid passed as argument to ibm,set-eeh-option rtas ++ * call. 
Now we just need to check the validity of the PCI ++ * pass-through devices (vfio-pci) under this sphb bus. ++ * We have already validated that all the devices under this sphb ++ * are from same iommu group (within same PE) before comming here. ++ * ++ * Prior to linux commit 98ba956f6a389 ("powerpc/pseries/eeh: ++ * Rework device EEH PE determination") kernel would call ++ * eeh-set-option for each device in the PE using the device's ++ * config_address as the argument rather than the PE address. ++ * Hence if we check validity of supplied config_addr whether ++ * it matches to this PHB will cause issues with older kernel ++ * versions v5.9 and older. If we return an error from ++ * eeh-set-option when the argument isn't a valid PE address ++ * then older kernels (v5.9 and older) will interpret that as ++ * EEH not being supported. + */ + phb = PCI_HOST_BRIDGE(sphb); +- pdev = pci_find_device(phb->bus, +- (addr >> 16) & 0xFF, (addr >> 8) & 0xFF); +- if (!pdev || !object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { ++ pci_for_each_device(phb->bus, (addr >> 16) & 0xFF, ++ spapr_eeh_pci_find_device, &found); ++ ++ if (!found) { + return RTAS_OUT_PARAM_ERROR; + } + +-- +2.27.0 + diff --git a/kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch b/kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch new file mode 100755 index 0000000..c14aa7d --- /dev/null +++ b/kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch @@ -0,0 +1,105 @@ +From 9ebed8090b88282f9b7432258df9182b9d3944ee Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:52 -0500 +Subject: [PATCH 4/9] spapr: Fix handling of unplugged devices during CAS and + migration + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-5-gkurz@redhat.com> +Patchwork-id: 100685 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 4/6] spapr: Fix handling of unplugged devices during CAS and migration +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier 
+RH-Acked-by: David Gibson + +From: Greg Kurz + +We already detect if a device is being hot plugged before CAS to trigger +a CAS reboot and during migration to migrate the state of the associated +DRC. But hot unplugging a device is also an asynchronous operation that +requires the guest to take action. This means that if the guest is migrated +after the hot unplug event was sent but before it could release the device +with RTAS, the destination QEMU doesn't know about the pending unplug +operation and doesn't actually remove the device when the guest finally +releases it. + +Similarly, if the unplug request is fired before CAS, the guest isn't +notified of the change, just like with hotplug. It ends up booting with +the device still present in the DT and configures it, just like it was +never removed. Even weirder, since the event is still queued, it will +be eventually processed when some other unrelated event is posted to +the guest. + +Enhance spapr_drc_transient() to also return true if an unplug request is +pending. This fixes the issue at CAS with a CAS reboot request and +causes the DRC state to be migrated. Some extra care is still needed to +inform the destination that an unplug request is pending : migrate the +unplug_requested field of the DRC in an optional subsection. This might +break backwards migration, but this is still better than ending with +an inconsistent guest. 
+ +Signed-off-by: Greg Kurz +Message-Id: <158169248798.3465937.1108351365840514270.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit ab8584349c476f9818dc6403359c85f9ab0ad5eb) +Signed-off-by: Greg Kurz +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr_drc.c | 25 +++++++++++++++++++++++-- + 1 file changed, 23 insertions(+), 2 deletions(-) + +diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c +index 9b498d429e..897bb7aae0 100644 +--- a/hw/ppc/spapr_drc.c ++++ b/hw/ppc/spapr_drc.c +@@ -455,6 +455,22 @@ void spapr_drc_reset(SpaprDrc *drc) + } + } + ++static bool spapr_drc_unplug_requested_needed(void *opaque) ++{ ++ return spapr_drc_unplug_requested(opaque); ++} ++ ++static const VMStateDescription vmstate_spapr_drc_unplug_requested = { ++ .name = "spapr_drc/unplug_requested", ++ .version_id = 1, ++ .minimum_version_id = 1, ++ .needed = spapr_drc_unplug_requested_needed, ++ .fields = (VMStateField []) { ++ VMSTATE_BOOL(unplug_requested, SpaprDrc), ++ VMSTATE_END_OF_LIST() ++ } ++}; ++ + bool spapr_drc_transient(SpaprDrc *drc) + { + SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); +@@ -470,9 +486,10 @@ bool spapr_drc_transient(SpaprDrc *drc) + /* + * We need to reset the DRC at CAS or to migrate the DRC state if it's + * not equal to the expected long-term state, which is the same as the +- * coldplugged initial state. ++ * coldplugged initial state, or if an unplug request is pending. 
+ */ +- return (drc->state != drck->ready_state); ++ return drc->state != drck->ready_state || ++ spapr_drc_unplug_requested(drc); + } + + static bool spapr_drc_needed(void *opaque) +@@ -488,6 +505,10 @@ static const VMStateDescription vmstate_spapr_drc = { + .fields = (VMStateField []) { + VMSTATE_UINT32(state, SpaprDrc), + VMSTATE_END_OF_LIST() ++ }, ++ .subsections = (const VMStateDescription * []) { ++ &vmstate_spapr_drc_unplug_requested, ++ NULL + } + }; + +-- +2.18.2 + diff --git a/kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch b/kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch new file mode 100755 index 0000000..b0ca288 --- /dev/null +++ b/kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch @@ -0,0 +1,246 @@ +From cb9d5380b1376b2a44d91d84eaf09f948ef1e165 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:50 -0500 +Subject: [PATCH 2/9] spapr: Fold h_cas_compose_response() into + h_client_architecture_support() + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-3-gkurz@redhat.com> +Patchwork-id: 100687 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 2/6] spapr: Fold h_cas_compose_response() into h_client_architecture_support() +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: David Gibson + +spapr_h_cas_compose_response() handles the last piece of the PAPR feature +negotiation process invoked via the ibm,client-architecture-support OF +call. Its only caller is h_client_architecture_support() which handles +most of the rest of that process. + +I believe it was placed in a separate file originally to handle some +fiddly dependencies between functions, but mostly it's just confusing +to have the CAS process split into two pieces like this. Now that +compose response is simplified (by just generating the whole device +tree anew), it's cleaner to just fold it into +h_client_architecture_support(). 
+ +Signed-off-by: David Gibson +Reviewed-by: Cedric Le Goater +Reviewed-by: Greg Kurz +(cherry picked from commit 0c21e073541cc093b4cb8744640e24f130e6f8ba) +Signed-off-by: Greg Kurz +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr.c | 61 +----------------------------------------- + hw/ppc/spapr_hcall.c | 55 ++++++++++++++++++++++++++++++++++--- + include/hw/ppc/spapr.h | 4 +-- + 3 files changed, 54 insertions(+), 66 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index 92f63ad035..992bd08aaa 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -76,7 +76,6 @@ + #include "hw/nmi.h" + #include "hw/intc/intc.h" + +-#include "qemu/cutils.h" + #include "hw/ppc/spapr_cpu_core.h" + #include "hw/mem/memory-device.h" + #include "hw/ppc/spapr_tpm_proxy.h" +@@ -898,63 +897,6 @@ out: + return ret; + } + +-static bool spapr_hotplugged_dev_before_cas(void) +-{ +- Object *drc_container, *obj; +- ObjectProperty *prop; +- ObjectPropertyIterator iter; +- +- drc_container = container_get(object_get_root(), "/dr-connector"); +- object_property_iter_init(&iter, drc_container); +- while ((prop = object_property_iter_next(&iter))) { +- if (!strstart(prop->type, "link<", NULL)) { +- continue; +- } +- obj = object_property_get_link(drc_container, prop->name, NULL); +- if (spapr_drc_needed(obj)) { +- return true; +- } +- } +- return false; +-} +- +-static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, +- size_t space); +- +-int spapr_h_cas_compose_response(SpaprMachineState *spapr, +- target_ulong addr, target_ulong size, +- SpaprOptionVector *ov5_updates) +-{ +- void *fdt; +- SpaprDeviceTreeUpdateHeader hdr = { .version_id = 1 }; +- +- if (spapr_hotplugged_dev_before_cas()) { +- return 1; +- } +- +- if (size < sizeof(hdr)) { +- error_report("SLOF provided insufficient CAS buffer " +- TARGET_FMT_lu " (min: %zu)", size, sizeof(hdr)); +- exit(EXIT_FAILURE); +- } +- +- size -= sizeof(hdr); +- +- fdt = spapr_build_fdt(spapr, false, size); +- _FDT((fdt_pack(fdt))); +- +- 
cpu_physical_memory_write(addr, &hdr, sizeof(hdr)); +- cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt)); +- trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); +- +- g_free(spapr->fdt_blob); +- spapr->fdt_size = fdt_totalsize(fdt); +- spapr->fdt_initial_size = spapr->fdt_size; +- spapr->fdt_blob = fdt; +- +- return 0; +-} +- + static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) + { + MachineState *ms = MACHINE(spapr); +@@ -1192,8 +1134,7 @@ static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt) + } + } + +-static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, +- size_t space) ++void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space) + { + MachineState *machine = MACHINE(spapr); + MachineClass *mc = MACHINE_GET_CLASS(machine); +diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c +index 05a7ca275b..0f19be794c 100644 +--- a/hw/ppc/spapr_hcall.c ++++ b/hw/ppc/spapr_hcall.c +@@ -1,4 +1,5 @@ + #include "qemu/osdep.h" ++#include "qemu/cutils.h" + #include "qapi/error.h" + #include "sysemu/hw_accel.h" + #include "sysemu/runstate.h" +@@ -15,6 +16,7 @@ + #include "cpu-models.h" + #include "trace.h" + #include "kvm_ppc.h" ++#include "hw/ppc/fdt.h" + #include "hw/ppc/spapr_ovec.h" + #include "mmu-book3s-v3.h" + #include "hw/mem/memory-device.h" +@@ -1638,6 +1640,26 @@ static uint32_t cas_check_pvr(SpaprMachineState *spapr, PowerPCCPU *cpu, + return best_compat; + } + ++static bool spapr_hotplugged_dev_before_cas(void) ++{ ++ Object *drc_container, *obj; ++ ObjectProperty *prop; ++ ObjectPropertyIterator iter; ++ ++ drc_container = container_get(object_get_root(), "/dr-connector"); ++ object_property_iter_init(&iter, drc_container); ++ while ((prop = object_property_iter_next(&iter))) { ++ if (!strstart(prop->type, "link<", NULL)) { ++ continue; ++ } ++ obj = object_property_get_link(drc_container, prop->name, NULL); ++ if (spapr_drc_needed(obj)) { ++ return true; ++ } ++ } ++ return 
false; ++} ++ + static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + SpaprMachineState *spapr, + target_ulong opcode, +@@ -1645,6 +1667,8 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + { + /* Working address in data buffer */ + target_ulong addr = ppc64_phys_to_real(args[0]); ++ target_ulong fdt_buf = args[1]; ++ target_ulong fdt_bufsize = args[2]; + target_ulong ov_table; + uint32_t cas_pvr; + SpaprOptionVector *ov1_guest, *ov5_guest, *ov5_cas_old, *ov5_updates; +@@ -1788,16 +1812,41 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + + spapr_irq_update_active_intc(spapr); + ++ if (spapr_hotplugged_dev_before_cas()) { ++ spapr->cas_reboot = true; ++ } ++ + if (!spapr->cas_reboot) { ++ void *fdt; ++ SpaprDeviceTreeUpdateHeader hdr = { .version_id = 1 }; ++ + /* If spapr_machine_reset() did not set up a HPT but one is necessary + * (because the guest isn't going to use radix) then set it up here. */ + if ((spapr->patb_entry & PATE1_GR) && !guest_radix) { + /* legacy hash or new hash: */ + spapr_setup_hpt_and_vrma(spapr); + } +- spapr->cas_reboot = +- (spapr_h_cas_compose_response(spapr, args[1], args[2], +- ov5_updates) != 0); ++ ++ if (fdt_bufsize < sizeof(hdr)) { ++ error_report("SLOF provided insufficient CAS buffer " ++ TARGET_FMT_lu " (min: %zu)", fdt_bufsize, sizeof(hdr)); ++ exit(EXIT_FAILURE); ++ } ++ ++ fdt_bufsize -= sizeof(hdr); ++ ++ fdt = spapr_build_fdt(spapr, false, fdt_bufsize); ++ _FDT((fdt_pack(fdt))); ++ ++ cpu_physical_memory_write(fdt_buf, &hdr, sizeof(hdr)); ++ cpu_physical_memory_write(fdt_buf + sizeof(hdr), fdt, ++ fdt_totalsize(fdt)); ++ trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); ++ ++ g_free(spapr->fdt_blob); ++ spapr->fdt_size = fdt_totalsize(fdt); ++ spapr->fdt_initial_size = spapr->fdt_size; ++ spapr->fdt_blob = fdt; + } + + spapr_ovec_cleanup(ov5_updates); +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h +index e047dabf30..e5e2a99046 100644 +--- 
a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -767,11 +767,9 @@ struct SpaprEventLogEntry { + QTAILQ_ENTRY(SpaprEventLogEntry) next; + }; + ++void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space); + void spapr_events_init(SpaprMachineState *sm); + void spapr_dt_events(SpaprMachineState *sm, void *fdt); +-int spapr_h_cas_compose_response(SpaprMachineState *sm, +- target_ulong addr, target_ulong size, +- SpaprOptionVector *ov5_updates); + void close_htab_fd(SpaprMachineState *spapr); + void spapr_setup_hpt_and_vrma(SpaprMachineState *spapr); + void spapr_free_hpt(SpaprMachineState *spapr); +-- +2.18.2 + diff --git a/kvm-spapr-Improve-handling-of-fdt-buffer-size.patch b/kvm-spapr-Improve-handling-of-fdt-buffer-size.patch new file mode 100755 index 0000000..2f57cde --- /dev/null +++ b/kvm-spapr-Improve-handling-of-fdt-buffer-size.patch @@ -0,0 +1,125 @@ +From 04f7fe2423a4de8d2fea7068b3fb316e15e76eaa Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:49 -0500 +Subject: [PATCH 1/9] spapr: Improve handling of fdt buffer size + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-2-gkurz@redhat.com> +Patchwork-id: 100682 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 1/6] spapr: Improve handling of fdt buffer size +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: David Gibson + +Previously, spapr_build_fdt() constructed the device tree in a fixed +buffer of size FDT_MAX_SIZE. This is a bit inflexible, but more +importantly it's awkward for the case where we use it during CAS. In +that case the guest firmware supplies a buffer and we have to +awkwardly check that what we generated fits into it afterwards, after +doing a lot of size checks during spapr_build_fdt(). + +Simplify this by having spapr_build_fdt() take a 'space' parameter. 
+For the CAS case, we pass in the buffer size provided by SLOF, for the +machine init case, we continue to pass FDT_MAX_SIZE. + +Signed-off-by: David Gibson +Reviewed-by: Cedric Le Goater +Reviewed-by: Greg Kurz +(cherry picked from commit 97b32a6afa78ae68fb16344b9a144b6f433f42a2) +Signed-off-by: Greg Kurz +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr.c | 33 +++++++++++---------------------- + 1 file changed, 11 insertions(+), 22 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index c74079702d..92f63ad035 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -918,7 +918,8 @@ static bool spapr_hotplugged_dev_before_cas(void) + return false; + } + +-static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset); ++static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, ++ size_t space); + + int spapr_h_cas_compose_response(SpaprMachineState *spapr, + target_ulong addr, target_ulong size, +@@ -931,24 +932,17 @@ int spapr_h_cas_compose_response(SpaprMachineState *spapr, + return 1; + } + +- if (size < sizeof(hdr) || size > FW_MAX_SIZE) { +- error_report("SLOF provided an unexpected CAS buffer size " +- TARGET_FMT_lu " (min: %zu, max: %u)", +- size, sizeof(hdr), FW_MAX_SIZE); ++ if (size < sizeof(hdr)) { ++ error_report("SLOF provided insufficient CAS buffer " ++ TARGET_FMT_lu " (min: %zu)", size, sizeof(hdr)); + exit(EXIT_FAILURE); + } + + size -= sizeof(hdr); + +- fdt = spapr_build_fdt(spapr, false); ++ fdt = spapr_build_fdt(spapr, false, size); + _FDT((fdt_pack(fdt))); + +- if (fdt_totalsize(fdt) + sizeof(hdr) > size) { +- g_free(fdt); +- trace_spapr_cas_failed(size); +- return -1; +- } +- + cpu_physical_memory_write(addr, &hdr, sizeof(hdr)); + cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt)); + trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); +@@ -1198,7 +1192,8 @@ static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt) + } + } + +-static void *spapr_build_fdt(SpaprMachineState *spapr, bool 
reset) ++static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, ++ size_t space) + { + MachineState *machine = MACHINE(spapr); + MachineClass *mc = MACHINE_GET_CLASS(machine); +@@ -1208,8 +1203,8 @@ static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset) + SpaprPhbState *phb; + char *buf; + +- fdt = g_malloc0(FDT_MAX_SIZE); +- _FDT((fdt_create_empty_tree(fdt, FDT_MAX_SIZE))); ++ fdt = g_malloc0(space); ++ _FDT((fdt_create_empty_tree(fdt, space))); + + /* Root node */ + _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp")); +@@ -1724,19 +1719,13 @@ static void spapr_machine_reset(MachineState *machine) + */ + fdt_addr = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FDT_MAX_SIZE; + +- fdt = spapr_build_fdt(spapr, true); ++ fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE); + + rc = fdt_pack(fdt); + + /* Should only fail if we've built a corrupted tree */ + assert(rc == 0); + +- if (fdt_totalsize(fdt) > FDT_MAX_SIZE) { +- error_report("FDT too big ! 0x%x bytes (max is 0x%x)", +- fdt_totalsize(fdt), FDT_MAX_SIZE); +- exit(1); +- } +- + /* Load the fdt */ + qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt)); + cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt)); +-- +2.18.2 + diff --git a/kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch b/kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch new file mode 100755 index 0000000..b4b2b5f --- /dev/null +++ b/kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch @@ -0,0 +1,170 @@ +From f94b3a4eb9d709f1f6a14ad9ad6ebcc1b67b6923 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:54 -0500 +Subject: [PATCH 6/9] spapr: Improve handling of memory unplug with old guests + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-7-gkurz@redhat.com> +Patchwork-id: 100684 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 6/6] spapr: Improve handling of memory unplug with old guests +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: 
David Gibson + +From: Greg Kurz + +Since commit 1e8b5b1aa16b ("spapr: Allow memory unplug to always succeed") +trying to unplug memory from a guest that doesn't support it (eg. rhel6) +no longer generates an error like it used to. Instead, it leaves the +memory around : only a subsequent reboot or manual use of drmgr within +the guest can complete the hot-unplug sequence. A flag was added to +SpaprMachineClass so that this new behavior only applies to the default +machine type. + +We can do better. CAS processes all pending hot-unplug requests. This +means that we don't really care about what the guest supports if +the hot-unplug request happens before CAS. + +All guests that we care for, even old ones, set enough bits in OV5 +that lead to a non-empty bitmap in spapr->ov5_cas. Use that as a +heuristic to decide if CAS has already occured or not. + +Always accept unplug requests that happen before CAS since CAS will +process them. Restore the previous behavior of rejecting them after +CAS when we know that the guest doesn't support memory hot-unplug. + +This behavior is suitable for all machine types : this allows to +drop the pre_6_0_memory_unplug flag. + +Fixes: 1e8b5b1aa16b ("spapr: Allow memory unplug to always succeed") +Signed-off-by: Greg Kurz +Message-Id: <161012708715.801107.11418801796987916516.stgit@bahia.lan> +Reviewed-by: Daniel Henrique Barboza +Signed-off-by: David Gibson +(cherry picked from commit 73598c75df0585e039825e642adede21912dabc7) +Signed-off-by: Greg Kurz + +Conflicts: + hw/ppc/spapr.c + include/hw/ppc/spapr.h + +Contextual conflicts around the removal of pre_6_0_memory_unplug, +which was only partially backported from upstream 1e8b5b1aa16b, and +the addition of spapr_memory_hot_unplug_supported(). 
+ +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr.c | 21 +++++++++++++-------- + hw/ppc/spapr_events.c | 3 +-- + hw/ppc/spapr_ovec.c | 7 +++++++ + include/hw/ppc/spapr.h | 2 +- + include/hw/ppc/spapr_ovec.h | 1 + + 5 files changed, 23 insertions(+), 11 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index f8de33e3e5..00b1ef075e 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -3993,6 +3993,18 @@ static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev, + } + } + ++bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr) ++{ ++ return spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT) || ++ /* ++ * CAS will process all pending unplug requests. ++ * ++ * HACK: a guest could theoretically have cleared all bits in OV5, ++ * but none of the guests we care for do. ++ */ ++ spapr_ovec_empty(spapr->ov5_cas); ++} ++ + static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) + { +@@ -4001,16 +4013,9 @@ static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev, + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { +- if (!smc->pre_6_0_memory_unplug || +- spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) { ++ if (spapr_memory_hot_unplug_supported(sms)) { + spapr_memory_unplug_request(hotplug_dev, dev, errp); + } else { +- /* NOTE: this means there is a window after guest reset, prior to +- * CAS negotiation, where unplug requests will fail due to the +- * capability not being detected yet. 
This is a bit different than +- * the case with PCI unplug, where the events will be queued and +- * eventually handled by the guest after boot +- */ + error_setg(errp, "Memory hot unplug not supported for this guest"); + } + } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) { +diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c +index 6e284aa4bc..08168acd65 100644 +--- a/hw/ppc/spapr_events.c ++++ b/hw/ppc/spapr_events.c +@@ -547,8 +547,7 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action, + /* we should not be using count_indexed value unless the guest + * supports dedicated hotplug event source + */ +- g_assert(!SPAPR_MACHINE_GET_CLASS(spapr)->pre_6_0_memory_unplug || +- spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT)); ++ g_assert(spapr_memory_hot_unplug_supported(spapr)); + hp->drc_id.count_indexed.count = + cpu_to_be32(drc_id->count_indexed.count); + hp->drc_id.count_indexed.index = +diff --git a/hw/ppc/spapr_ovec.c b/hw/ppc/spapr_ovec.c +index 811fadf143..f858afc7d5 100644 +--- a/hw/ppc/spapr_ovec.c ++++ b/hw/ppc/spapr_ovec.c +@@ -135,6 +135,13 @@ bool spapr_ovec_test(SpaprOptionVector *ov, long bitnr) + return test_bit(bitnr, ov->bitmap) ? 
true : false; + } + ++bool spapr_ovec_empty(SpaprOptionVector *ov) ++{ ++ g_assert(ov); ++ ++ return bitmap_empty(ov->bitmap, OV_MAXBITS); ++} ++ + static void guest_byte_to_bitmap(uint8_t entry, unsigned long *bitmap, + long bitmap_offset) + { +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h +index ac6961ed16..7aaf5d9996 100644 +--- a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -124,7 +124,6 @@ struct SpaprMachineClass { + bool pre_4_1_migration; /* don't migrate hpt-max-page-size */ + bool linux_pci_probe; + bool smp_threads_vsmt; /* set VSMT to smp_threads by default */ +- bool pre_6_0_memory_unplug; + + bool has_power9_support; + void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, +@@ -894,4 +893,5 @@ void spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, + #define SPAPR_OV5_XIVE_BOTH 0x80 /* Only to advertise on the platform */ + + void spapr_set_all_lpcrs(target_ulong value, target_ulong mask); ++bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr); + #endif /* HW_SPAPR_H */ +diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h +index 7891e9caac..98c73bf601 100644 +--- a/include/hw/ppc/spapr_ovec.h ++++ b/include/hw/ppc/spapr_ovec.h +@@ -73,6 +73,7 @@ void spapr_ovec_cleanup(SpaprOptionVector *ov); + void spapr_ovec_set(SpaprOptionVector *ov, long bitnr); + void spapr_ovec_clear(SpaprOptionVector *ov, long bitnr); + bool spapr_ovec_test(SpaprOptionVector *ov, long bitnr); ++bool spapr_ovec_empty(SpaprOptionVector *ov); + SpaprOptionVector *spapr_ovec_parse_vector(target_ulong table_addr, int vector); + int spapr_ovec_populate_dt(void *fdt, int fdt_offset, + SpaprOptionVector *ov, const char *name); +-- +2.18.2 + diff --git a/kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch b/kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch new file mode 100755 index 0000000..7c48718 --- /dev/null +++ b/kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch @@ 
-0,0 +1,213 @@ +From 5aea41b56f07f586e0f56a5c8b3e8443e485cd77 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 5 Jun 2020 07:41:09 -0400 +Subject: [PATCH 39/42] spapr: Pass the maximum number of vCPUs to the KVM + interrupt controller +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200605074111.2185-2-thuth@redhat.com> +Patchwork-id: 97368 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/3] spapr: Pass the maximum number of vCPUs to the KVM interrupt controller +Bugzilla: 1756946 +RH-Acked-by: Greg Kurz +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Greg Kurz + +The XIVE and XICS-on-XIVE KVM devices on POWER9 hosts can greatly reduce +their consumption of some scarce HW resources, namely Virtual Presenter +identifiers, if they know the maximum number of vCPUs that may run in the +VM. + +Prepare ground for this by passing the value down to xics_kvm_connect() +and kvmppc_xive_connect(). This is purely mechanical, no functional +change. + +Signed-off-by: Greg Kurz +Message-Id: <157478678301.67101.2717368060417156338.stgit@bahia.tlslab.ibm.com> +Reviewed-by: Cédric Le Goater +Signed-off-by: David Gibson +(cherry picked from commit 4ffb7496881ec361deaf1f51c41a933bde3cbf7b) +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/intc/spapr_xive.c | 6 ++++-- + hw/intc/spapr_xive_kvm.c | 3 ++- + hw/intc/xics_kvm.c | 3 ++- + hw/intc/xics_spapr.c | 5 +++-- + hw/ppc/spapr_irq.c | 8 +++++--- + include/hw/ppc/spapr_irq.h | 10 ++++++++-- + include/hw/ppc/spapr_xive.h | 3 ++- + include/hw/ppc/xics_spapr.h | 3 ++- + 8 files changed, 28 insertions(+), 13 deletions(-) + +diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c +index 9cb8d38a3b..a570e6e90a 100644 +--- a/hw/intc/spapr_xive.c ++++ b/hw/intc/spapr_xive.c +@@ -651,12 +651,14 @@ static void spapr_xive_dt(SpaprInterruptController *intc, uint32_t nr_servers, + plat_res_int_priorities, sizeof(plat_res_int_priorities))); + } + +-static int spapr_xive_activate(SpaprInterruptController *intc, Error **errp) ++static int spapr_xive_activate(SpaprInterruptController *intc, ++ uint32_t nr_servers, Error **errp) + { + SpaprXive *xive = SPAPR_XIVE(intc); + + if (kvm_enabled()) { +- int rc = spapr_irq_init_kvm(kvmppc_xive_connect, intc, errp); ++ int rc = spapr_irq_init_kvm(kvmppc_xive_connect, intc, nr_servers, ++ errp); + if (rc < 0) { + return rc; + } +diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c +index 08012ac7cd..c1c837a764 100644 +--- a/hw/intc/spapr_xive_kvm.c ++++ b/hw/intc/spapr_xive_kvm.c +@@ -740,7 +740,8 @@ static void *kvmppc_xive_mmap(SpaprXive *xive, int pgoff, size_t len, + * All the XIVE memory regions are now backed by mappings from the KVM + * XIVE device. 
+ */ +-int kvmppc_xive_connect(SpaprInterruptController *intc, Error **errp) ++int kvmppc_xive_connect(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp) + { + SpaprXive *xive = SPAPR_XIVE(intc); + XiveSource *xsrc = &xive->source; +diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c +index 954c424b36..a1f1b7b0d3 100644 +--- a/hw/intc/xics_kvm.c ++++ b/hw/intc/xics_kvm.c +@@ -342,7 +342,8 @@ void ics_kvm_set_irq(ICSState *ics, int srcno, int val) + } + } + +-int xics_kvm_connect(SpaprInterruptController *intc, Error **errp) ++int xics_kvm_connect(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp) + { + ICSState *ics = ICS_SPAPR(intc); + int rc; +diff --git a/hw/intc/xics_spapr.c b/hw/intc/xics_spapr.c +index b3705dab0e..8ae4f41459 100644 +--- a/hw/intc/xics_spapr.c ++++ b/hw/intc/xics_spapr.c +@@ -422,10 +422,11 @@ static int xics_spapr_post_load(SpaprInterruptController *intc, int version_id) + return 0; + } + +-static int xics_spapr_activate(SpaprInterruptController *intc, Error **errp) ++static int xics_spapr_activate(SpaprInterruptController *intc, ++ uint32_t nr_servers, Error **errp) + { + if (kvm_enabled()) { +- return spapr_irq_init_kvm(xics_kvm_connect, intc, errp); ++ return spapr_irq_init_kvm(xics_kvm_connect, intc, nr_servers, errp); + } + return 0; + } +diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c +index d6bb7fd2d6..9da423658a 100644 +--- a/hw/ppc/spapr_irq.c ++++ b/hw/ppc/spapr_irq.c +@@ -70,15 +70,16 @@ void spapr_irq_msi_free(SpaprMachineState *spapr, int irq, uint32_t num) + bitmap_clear(spapr->irq_map, irq - SPAPR_IRQ_MSI, num); + } + +-int spapr_irq_init_kvm(int (*fn)(SpaprInterruptController *, Error **), ++int spapr_irq_init_kvm(SpaprInterruptControllerInitKvm fn, + SpaprInterruptController *intc, ++ uint32_t nr_servers, + Error **errp) + { + MachineState *machine = MACHINE(qdev_get_machine()); + Error *local_err = NULL; + + if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) { +- if 
(fn(intc, &local_err) < 0) { ++ if (fn(intc, nr_servers, &local_err) < 0) { + if (machine_kernel_irqchip_required(machine)) { + error_prepend(&local_err, + "kernel_irqchip requested but unavailable: "); +@@ -495,6 +496,7 @@ static void set_active_intc(SpaprMachineState *spapr, + SpaprInterruptController *new_intc) + { + SpaprInterruptControllerClass *sicc; ++ uint32_t nr_servers = spapr_max_server_number(spapr); + + assert(new_intc); + +@@ -512,7 +514,7 @@ static void set_active_intc(SpaprMachineState *spapr, + + sicc = SPAPR_INTC_GET_CLASS(new_intc); + if (sicc->activate) { +- sicc->activate(new_intc, &error_fatal); ++ sicc->activate(new_intc, nr_servers, &error_fatal); + } + + spapr->active_intc = new_intc; +diff --git a/include/hw/ppc/spapr_irq.h b/include/hw/ppc/spapr_irq.h +index ff814d13de..ca8cb44213 100644 +--- a/include/hw/ppc/spapr_irq.h ++++ b/include/hw/ppc/spapr_irq.h +@@ -43,7 +43,8 @@ typedef struct SpaprInterruptController SpaprInterruptController; + typedef struct SpaprInterruptControllerClass { + InterfaceClass parent; + +- int (*activate)(SpaprInterruptController *intc, Error **errp); ++ int (*activate)(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp); + void (*deactivate)(SpaprInterruptController *intc); + + /* +@@ -98,8 +99,13 @@ qemu_irq spapr_qirq(SpaprMachineState *spapr, int irq); + int spapr_irq_post_load(SpaprMachineState *spapr, int version_id); + void spapr_irq_reset(SpaprMachineState *spapr, Error **errp); + int spapr_irq_get_phandle(SpaprMachineState *spapr, void *fdt, Error **errp); +-int spapr_irq_init_kvm(int (*fn)(SpaprInterruptController *, Error **), ++ ++typedef int (*SpaprInterruptControllerInitKvm)(SpaprInterruptController *, ++ uint32_t, Error **); ++ ++int spapr_irq_init_kvm(SpaprInterruptControllerInitKvm fn, + SpaprInterruptController *intc, ++ uint32_t nr_servers, + Error **errp); + + /* +diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h +index 742b7e834f..3a103c224d 100644 +--- 
a/include/hw/ppc/spapr_xive.h ++++ b/include/hw/ppc/spapr_xive.h +@@ -66,7 +66,8 @@ int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx, + /* + * KVM XIVE device helpers + */ +-int kvmppc_xive_connect(SpaprInterruptController *intc, Error **errp); ++int kvmppc_xive_connect(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp); + void kvmppc_xive_disconnect(SpaprInterruptController *intc); + void kvmppc_xive_reset(SpaprXive *xive, Error **errp); + void kvmppc_xive_set_source_config(SpaprXive *xive, uint32_t lisn, XiveEAS *eas, +diff --git a/include/hw/ppc/xics_spapr.h b/include/hw/ppc/xics_spapr.h +index 28b87038c8..1c65c96e3c 100644 +--- a/include/hw/ppc/xics_spapr.h ++++ b/include/hw/ppc/xics_spapr.h +@@ -32,7 +32,8 @@ + #define TYPE_ICS_SPAPR "ics-spapr" + #define ICS_SPAPR(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS_SPAPR) + +-int xics_kvm_connect(SpaprInterruptController *intc, Error **errp); ++int xics_kvm_connect(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp); + void xics_kvm_disconnect(SpaprInterruptController *intc); + bool xics_kvm_has_broken_disconnect(SpaprMachineState *spapr); + +-- +2.27.0 + diff --git a/kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch b/kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch new file mode 100755 index 0000000..4f15509 --- /dev/null +++ b/kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch @@ -0,0 +1,50 @@ +From b46fdf56b1a7938468565838bdadf260870e4f9b Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Wed, 9 Jun 2021 10:05:00 -0400 +Subject: [PATCH 3/4] spapr: Remove stale comment about power-saving LPCR bits +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +Message-id: <20210609100501.427096-2-lvivier@redhat.com> +Patchwork-id: 101682 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] spapr: Remove stale comment about power-saving LPCR bits +Bugzilla: 1969768 
+RH-Acked-by: Stefano Garzarella +RH-Acked-by: David Gibson +RH-Acked-by: Greg Kurz + +From: Nicholas Piggin + +Commit 47a9b551547 ("spapr: Clean up handling of LPCR power-saving exit +bits") moved this logic but did not remove the comment from the +previous location. + +Signed-off-by: Nicholas Piggin +Message-Id: <20210526091626.3388262-2-npiggin@gmail.com> +Reviewed-by: Cédric Le Goater +Reviewed-by: Greg Kurz +Signed-off-by: David Gibson +(cherry picked from commit 7be3bf6c8429969f97728bb712d9a99997835607) +Signed-off-by: Laurent Vivier +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_rtas.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c +index 8d8d8cdfcb..295eac986e 100644 +--- a/hw/ppc/spapr_rtas.c ++++ b/hw/ppc/spapr_rtas.c +@@ -163,7 +163,6 @@ static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr, + + env->msr = (1ULL << MSR_SF) | (1ULL << MSR_ME); + +- /* Enable Power-saving mode Exit Cause exceptions for the new CPU */ + lpcr = env->spr[SPR_LPCR]; + if (!pcc->interrupts_big_endian(callcpu)) { + lpcr |= LPCR_ILE; +-- +2.27.0 + diff --git a/kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch b/kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch new file mode 100755 index 0000000..84abc74 --- /dev/null +++ b/kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch @@ -0,0 +1,89 @@ +From 28794dca79a94d01c8732b84fe6ac6ba2986ce45 Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Wed, 9 Jun 2021 10:05:01 -0400 +Subject: [PATCH 4/4] spapr: Set LPCR to current AIL mode when starting a new + CPU +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +Message-id: <20210609100501.427096-3-lvivier@redhat.com> +Patchwork-id: 101683 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] spapr: Set LPCR to current AIL mode when starting a new CPU +Bugzilla: 1969768 +RH-Acked-by: Stefano Garzarella 
+RH-Acked-by: David Gibson +RH-Acked-by: Greg Kurz + +From: Nicholas Piggin + +TCG does not keep track of AIL mode in a central place, it's based on +the current LPCR[AIL] bits. Synchronize the new CPU's LPCR to the +current LPCR in rtas_start_cpu(), similarly to the way the ILE bit is +synchronized. + +Open-code the ILE setting as well now that the caller's LPCR is +available directly, there is no need for the indirection. + +Without this, under both TCG and KVM, adding a POWER8/9/10 class CPU +with a new core ID after a modern Linux has booted results in the new +CPU's LPCR missing the LPCR[AIL]=0b11 setting that the other CPUs have. +This can cause crashes and unexpected behaviour. + +Signed-off-by: Nicholas Piggin +Message-Id: <20210526091626.3388262-3-npiggin@gmail.com> +Reviewed-by: Cédric Le Goater +Reviewed-by: Greg Kurz +Signed-off-by: David Gibson +(cherry picked from commit ac559ecbea2649819e7b3fdd09f4e0243e0128db) +Signed-off-by: Laurent Vivier +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_rtas.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c +index 295eac986e..5acb7c1f10 100644 +--- a/hw/ppc/spapr_rtas.c ++++ b/hw/ppc/spapr_rtas.c +@@ -132,8 +132,8 @@ static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr, + target_ulong id, start, r3; + PowerPCCPU *newcpu; + CPUPPCState *env; +- PowerPCCPUClass *pcc; + target_ulong lpcr; ++ target_ulong caller_lpcr; + + if (nargs != 3 || nret != 1) { + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +@@ -152,7 +152,6 @@ static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr, + } + + env = &newcpu->env; +- pcc = POWERPC_CPU_GET_CLASS(newcpu); + + if (!CPU(newcpu)->halted) { + rtas_st(rets, 0, RTAS_OUT_HW_ERROR); +@@ -163,10 +162,15 @@ static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr, + + env->msr = (1ULL << MSR_SF) | (1ULL << MSR_ME); + ++ caller_lpcr = 
callcpu->env.spr[SPR_LPCR]; + lpcr = env->spr[SPR_LPCR]; +- if (!pcc->interrupts_big_endian(callcpu)) { +- lpcr |= LPCR_ILE; +- } ++ ++ /* Set ILE the same way */ ++ lpcr = (lpcr & ~LPCR_ILE) | (caller_lpcr & LPCR_ILE); ++ ++ /* Set AIL the same way */ ++ lpcr = (lpcr & ~LPCR_AIL) | (caller_lpcr & LPCR_AIL); ++ + if (env->mmu_model == POWERPC_MMU_3_00) { + /* + * New cpus are expected to start in the same radix/hash mode +-- +2.27.0 + diff --git a/kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch b/kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch new file mode 100755 index 0000000..e8c9f8b --- /dev/null +++ b/kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch @@ -0,0 +1,54 @@ +From 07df0f52c26a3819bc02b4f2970b6735bcf15c5b Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:42 -0400 +Subject: [PATCH 4/9] sungem: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-5-jmaloy@redhat.com> +Patchwork-id: 101786 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 4/9] sungem: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Mark Cave-Ayland +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Alistair Francis +Signed-off-by: Jason Wang + +(cherry picked from commit 8c92060d3c0248bd4d515719a35922cd2391b9b4) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/sungem.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/sungem.c b/hw/net/sungem.c +index f31d41ac5b..8b202b5c15 100644 +--- a/hw/net/sungem.c ++++ b/hw/net/sungem.c +@@ -305,7 +305,7 @@ static void sungem_send_packet(SunGEMState *s, const uint8_t *buf, + NetClientState *nc = qemu_get_queue(s->nic); + + if (s->macregs[MAC_XIFCFG >> 2] & MAC_XIFCFG_LBCK) { +- nc->info->receive(nc, buf, size); ++ qemu_receive_packet(nc, buf, size); + } else { + qemu_send_packet(nc, buf, size); + } +-- +2.27.0 + diff --git a/kvm-target-arm-Fix-PAuth-sbox-functions.patch b/kvm-target-arm-Fix-PAuth-sbox-functions.patch new file mode 100755 index 0000000..0e08184 --- /dev/null +++ b/kvm-target-arm-Fix-PAuth-sbox-functions.patch @@ -0,0 +1,65 @@ +From b8c8288a65146952cdfe7d5f0cd96734c9de8ee1 Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Thu, 7 May 2020 17:57:08 +0100 +Subject: [PATCH 1/7] target/arm: Fix PAuth sbox functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200507175708.1165177-2-jmaloy@redhat.com> +Patchwork-id: 96341 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] target/arm: Fix PAuth sbox functions +Bugzilla: 1813940 +RH-Acked-by: Andrew Jones +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella + +From: Vincent Dehors + +In the PAC computation, sbox was applied over wrong bits. +As this is a 4-bit sbox, bit index should be incremented by 4 instead of 16. + +Test vector from QARMA paper (https://eprint.iacr.org/2016/444.pdf) was +used to verify one computation of the pauth_computepac() function which +uses sbox2. 
+ +Launchpad: https://bugs.launchpad.net/bugs/1859713 +Reviewed-by: Richard Henderson +Signed-off-by: Vincent DEHORS +Signed-off-by: Adrien GRASSEIN +Message-id: 20200116230809.19078-2-richard.henderson@linaro.org +Reviewed-by: Peter Maydell +Signed-off-by: Peter Maydell +(cherry picked from commit de0b1bae6461f67243282555475f88b2384a1eb9) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + target/arm/pauth_helper.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/target/arm/pauth_helper.c b/target/arm/pauth_helper.c +index d3194f2..0a5f41e 100644 +--- a/target/arm/pauth_helper.c ++++ b/target/arm/pauth_helper.c +@@ -89,7 +89,7 @@ static uint64_t pac_sub(uint64_t i) + uint64_t o = 0; + int b; + +- for (b = 0; b < 64; b += 16) { ++ for (b = 0; b < 64; b += 4) { + o |= (uint64_t)sub[(i >> b) & 0xf] << b; + } + return o; +@@ -104,7 +104,7 @@ static uint64_t pac_inv_sub(uint64_t i) + uint64_t o = 0; + int b; + +- for (b = 0; b < 64; b += 16) { ++ for (b = 0; b < 64; b += 4) { + o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b; + } + return o; +-- +1.8.3.1 + diff --git a/kvm-target-arm-arch_dump-Add-SVE-notes.patch b/kvm-target-arm-arch_dump-Add-SVE-notes.patch new file mode 100755 index 0000000..febea10 --- /dev/null +++ b/kvm-target-arm-arch_dump-Add-SVE-notes.patch @@ -0,0 +1,298 @@ +From d8871ae2842531130c9b333e7c06a6a5d1561286 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 24 Jan 2020 09:14:34 +0100 +Subject: [PATCH 001/116] target/arm/arch_dump: Add SVE notes + +RH-Author: Andrew Jones +Message-id: <20200124091434.15021-2-drjones@redhat.com> +Patchwork-id: 93443 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] target/arm/arch_dump: Add SVE notes +Bugzilla: 1725084 +RH-Acked-by: Auger Eric +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1725084 + +Author: Andrew Jones +Date: Thu, 23 Jan 2020 15:22:40 +0000 + + target/arm/arch_dump: Add SVE notes + + When 
dumping a guest with dump-guest-memory also dump the SVE + registers if they are in use. + + Signed-off-by: Andrew Jones + Reviewed-by: Richard Henderson + Message-id: 20200120101832.18781-1-drjones@redhat.com + [PMM: fixed checkpatch nits] + Signed-off-by: Peter Maydell + +(cherry picked from commit 538baab245ca881e6a6ff720b5133f3ad1fcaafc) +Signed-off-by: Miroslav Rezanina +--- + include/elf.h | 1 + + target/arm/arch_dump.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++- + target/arm/cpu.h | 25 ++++++++++ + target/arm/kvm64.c | 24 ---------- + 4 files changed, 148 insertions(+), 26 deletions(-) + +diff --git a/include/elf.h b/include/elf.h +index 3501e0c..8fbfe60 100644 +--- a/include/elf.h ++++ b/include/elf.h +@@ -1650,6 +1650,7 @@ typedef struct elf64_shdr { + #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ + #define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ + #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ ++#define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension regs */ + + /* + * Physical entry point into the kernel. 
+diff --git a/target/arm/arch_dump.c b/target/arm/arch_dump.c +index 26a2c09..2345dec 100644 +--- a/target/arm/arch_dump.c ++++ b/target/arm/arch_dump.c +@@ -62,12 +62,23 @@ struct aarch64_user_vfp_state { + + QEMU_BUILD_BUG_ON(sizeof(struct aarch64_user_vfp_state) != 528); + ++/* struct user_sve_header from arch/arm64/include/uapi/asm/ptrace.h */ ++struct aarch64_user_sve_header { ++ uint32_t size; ++ uint32_t max_size; ++ uint16_t vl; ++ uint16_t max_vl; ++ uint16_t flags; ++ uint16_t reserved; ++} QEMU_PACKED; ++ + struct aarch64_note { + Elf64_Nhdr hdr; + char name[8]; /* align_up(sizeof("CORE"), 4) */ + union { + struct aarch64_elf_prstatus prstatus; + struct aarch64_user_vfp_state vfp; ++ struct aarch64_user_sve_header sve; + }; + } QEMU_PACKED; + +@@ -76,6 +87,8 @@ struct aarch64_note { + (AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_elf_prstatus)) + #define AARCH64_PRFPREG_NOTE_SIZE \ + (AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_user_vfp_state)) ++#define AARCH64_SVE_NOTE_SIZE(env) \ ++ (AARCH64_NOTE_HEADER_SIZE + sve_size(env)) + + static void aarch64_note_init(struct aarch64_note *note, DumpState *s, + const char *name, Elf64_Word namesz, +@@ -128,11 +141,102 @@ static int aarch64_write_elf64_prfpreg(WriteCoreDumpFunction f, + return 0; + } + ++#ifdef TARGET_AARCH64 ++static off_t sve_zreg_offset(uint32_t vq, int n) ++{ ++ off_t off = sizeof(struct aarch64_user_sve_header); ++ return ROUND_UP(off, 16) + vq * 16 * n; ++} ++ ++static off_t sve_preg_offset(uint32_t vq, int n) ++{ ++ return sve_zreg_offset(vq, 32) + vq * 16 / 8 * n; ++} ++ ++static off_t sve_fpsr_offset(uint32_t vq) ++{ ++ off_t off = sve_preg_offset(vq, 17); ++ return ROUND_UP(off, 16); ++} ++ ++static off_t sve_fpcr_offset(uint32_t vq) ++{ ++ return sve_fpsr_offset(vq) + sizeof(uint32_t); ++} ++ ++static uint32_t sve_current_vq(CPUARMState *env) ++{ ++ return sve_zcr_len_for_el(env, arm_current_el(env)) + 1; ++} ++ ++static size_t sve_size_vq(uint32_t vq) ++{ ++ off_t off = 
sve_fpcr_offset(vq) + sizeof(uint32_t); ++ return ROUND_UP(off, 16); ++} ++ ++static size_t sve_size(CPUARMState *env) ++{ ++ return sve_size_vq(sve_current_vq(env)); ++} ++ ++static int aarch64_write_elf64_sve(WriteCoreDumpFunction f, ++ CPUARMState *env, int cpuid, ++ DumpState *s) ++{ ++ struct aarch64_note *note; ++ ARMCPU *cpu = env_archcpu(env); ++ uint32_t vq = sve_current_vq(env); ++ uint64_t tmp[ARM_MAX_VQ * 2], *r; ++ uint32_t fpr; ++ uint8_t *buf; ++ int ret, i; ++ ++ note = g_malloc0(AARCH64_SVE_NOTE_SIZE(env)); ++ buf = (uint8_t *)&note->sve; ++ ++ aarch64_note_init(note, s, "LINUX", 6, NT_ARM_SVE, sve_size_vq(vq)); ++ ++ note->sve.size = cpu_to_dump32(s, sve_size_vq(vq)); ++ note->sve.max_size = cpu_to_dump32(s, sve_size_vq(cpu->sve_max_vq)); ++ note->sve.vl = cpu_to_dump16(s, vq * 16); ++ note->sve.max_vl = cpu_to_dump16(s, cpu->sve_max_vq * 16); ++ note->sve.flags = cpu_to_dump16(s, 1); ++ ++ for (i = 0; i < 32; ++i) { ++ r = sve_bswap64(tmp, &env->vfp.zregs[i].d[0], vq * 2); ++ memcpy(&buf[sve_zreg_offset(vq, i)], r, vq * 16); ++ } ++ ++ for (i = 0; i < 17; ++i) { ++ r = sve_bswap64(tmp, r = &env->vfp.pregs[i].p[0], ++ DIV_ROUND_UP(vq * 2, 8)); ++ memcpy(&buf[sve_preg_offset(vq, i)], r, vq * 16 / 8); ++ } ++ ++ fpr = cpu_to_dump32(s, vfp_get_fpsr(env)); ++ memcpy(&buf[sve_fpsr_offset(vq)], &fpr, sizeof(uint32_t)); ++ ++ fpr = cpu_to_dump32(s, vfp_get_fpcr(env)); ++ memcpy(&buf[sve_fpcr_offset(vq)], &fpr, sizeof(uint32_t)); ++ ++ ret = f(note, AARCH64_SVE_NOTE_SIZE(env), s); ++ g_free(note); ++ ++ if (ret < 0) { ++ return -1; ++ } ++ ++ return 0; ++} ++#endif ++ + int arm_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, + int cpuid, void *opaque) + { + struct aarch64_note note; +- CPUARMState *env = &ARM_CPU(cs)->env; ++ ARMCPU *cpu = ARM_CPU(cs); ++ CPUARMState *env = &cpu->env; + DumpState *s = opaque; + uint64_t pstate, sp; + int ret, i; +@@ -163,7 +267,18 @@ int arm_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, + return -1; 
+ } + +- return aarch64_write_elf64_prfpreg(f, env, cpuid, s); ++ ret = aarch64_write_elf64_prfpreg(f, env, cpuid, s); ++ if (ret) { ++ return ret; ++ } ++ ++#ifdef TARGET_AARCH64 ++ if (cpu_isar_feature(aa64_sve, cpu)) { ++ ret = aarch64_write_elf64_sve(f, env, cpuid, s); ++ } ++#endif ++ ++ return ret; + } + + /* struct pt_regs from arch/arm/include/asm/ptrace.h */ +@@ -335,6 +450,11 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus) + if (class == ELFCLASS64) { + note_size = AARCH64_PRSTATUS_NOTE_SIZE; + note_size += AARCH64_PRFPREG_NOTE_SIZE; ++#ifdef TARGET_AARCH64 ++ if (cpu_isar_feature(aa64_sve, cpu)) { ++ note_size += AARCH64_SVE_NOTE_SIZE(env); ++ } ++#endif + } else { + note_size = ARM_PRSTATUS_NOTE_SIZE; + if (arm_feature(env, ARM_FEATURE_VFP)) { +diff --git a/target/arm/cpu.h b/target/arm/cpu.h +index 83a809d..82dd3cc 100644 +--- a/target/arm/cpu.h ++++ b/target/arm/cpu.h +@@ -975,6 +975,31 @@ void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq); + void aarch64_sve_change_el(CPUARMState *env, int old_el, + int new_el, bool el0_a64); + void aarch64_add_sve_properties(Object *obj); ++ ++/* ++ * SVE registers are encoded in KVM's memory in an endianness-invariant format. ++ * The byte at offset i from the start of the in-memory representation contains ++ * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the ++ * lowest offsets are stored in the lowest memory addresses, then that nearly ++ * matches QEMU's representation, which is to use an array of host-endian ++ * uint64_t's, where the lower offsets are at the lower indices. To complete ++ * the translation we just need to byte swap the uint64_t's on big-endian hosts. 
++ */ ++static inline uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr) ++{ ++#ifdef HOST_WORDS_BIGENDIAN ++ int i; ++ ++ for (i = 0; i < nr; ++i) { ++ dst[i] = bswap64(src[i]); ++ } ++ ++ return dst; ++#else ++ return src; ++#endif ++} ++ + #else + static inline void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq) { } + static inline void aarch64_sve_change_el(CPUARMState *env, int o, +diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c +index 876184b..e2da756 100644 +--- a/target/arm/kvm64.c ++++ b/target/arm/kvm64.c +@@ -877,30 +877,6 @@ static int kvm_arch_put_fpsimd(CPUState *cs) + } + + /* +- * SVE registers are encoded in KVM's memory in an endianness-invariant format. +- * The byte at offset i from the start of the in-memory representation contains +- * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the +- * lowest offsets are stored in the lowest memory addresses, then that nearly +- * matches QEMU's representation, which is to use an array of host-endian +- * uint64_t's, where the lower offsets are at the lower indices. To complete +- * the translation we just need to byte swap the uint64_t's on big-endian hosts. +- */ +-static uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr) +-{ +-#ifdef HOST_WORDS_BIGENDIAN +- int i; +- +- for (i = 0; i < nr; ++i) { +- dst[i] = bswap64(src[i]); +- } +- +- return dst; +-#else +- return src; +-#endif +-} +- +-/* + * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits + * and PREGS and the FFR have a slice size of 256 bits. 
However we simply hard + * code the slice index to zero for now as it's unlikely we'll need more than +-- +1.8.3.1 + diff --git a/kvm-target-arm-cpu-Add-the-kvm-no-adjvtime-CPU-property.patch b/kvm-target-arm-cpu-Add-the-kvm-no-adjvtime-CPU-property.patch new file mode 100755 index 0000000..601b8c4 --- /dev/null +++ b/kvm-target-arm-cpu-Add-the-kvm-no-adjvtime-CPU-property.patch @@ -0,0 +1,281 @@ +From 730f72105b478553c4f22555c29b0f64224ff914 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:14 +0000 +Subject: [PATCH 12/15] target/arm/cpu: Add the kvm-no-adjvtime CPU property +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-6-drjones@redhat.com> +Patchwork-id: 93623 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 5/5] target/arm/cpu: Add the kvm-no-adjvtime CPU property +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:06 +0000 + + target/arm/cpu: Add the kvm-no-adjvtime CPU property + + kvm-no-adjvtime is a KVM specific CPU property and a first of its + kind. To accommodate it we also add kvm_arm_add_vcpu_properties() + and a KVM specific CPU properties description to the CPU features + document. + + Signed-off-by: Andrew Jones + Message-id: 20200120101023.16030-7-drjones@redhat.com + Reviewed-by: Peter Maydell + Signed-off-by: Peter Maydell + +(cherry picked from commit dea101a1ae9968c9fec6ab0291489dad7c49f36f) +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + Dropped the second hunk of the hw/arm/virt.c changes + as they would patch dead code. + +Signed-off-by: Danilo C. L. 
de Paula +--- + docs/arm-cpu-features.rst | 37 ++++++++++++++++++++++++++++++++++++- + hw/arm/virt.c | 5 +++++ + include/hw/arm/virt.h | 1 + + target/arm/cpu.c | 2 ++ + target/arm/cpu64.c | 1 + + target/arm/kvm.c | 28 ++++++++++++++++++++++++++++ + target/arm/kvm_arm.h | 11 +++++++++++ + target/arm/monitor.c | 1 + + tests/arm-cpu-features.c | 4 ++++ + 9 files changed, 89 insertions(+), 1 deletion(-) + +diff --git a/docs/arm-cpu-features.rst b/docs/arm-cpu-features.rst +index 1b367e2..45d1eb6 100644 +--- a/docs/arm-cpu-features.rst ++++ b/docs/arm-cpu-features.rst +@@ -31,7 +31,9 @@ supporting the feature or only supporting the feature under certain + configurations. For example, the `aarch64` CPU feature, which, when + disabled, enables the optional AArch32 CPU feature, is only supported + when using the KVM accelerator and when running on a host CPU type that +-supports the feature. ++supports the feature. While `aarch64` currently only works with KVM, ++it could work with TCG. CPU features that are specific to KVM are ++prefixed with "kvm-" and are described in "KVM VCPU Features". + + CPU Feature Probing + =================== +@@ -171,6 +173,39 @@ disabling many SVE vector lengths would be quite verbose, the `sve` CPU + properties have special semantics (see "SVE CPU Property Parsing + Semantics"). + ++KVM VCPU Features ++================= ++ ++KVM VCPU features are CPU features that are specific to KVM, such as ++paravirt features or features that enable CPU virtualization extensions. ++The features' CPU properties are only available when KVM is enabled and ++are named with the prefix "kvm-". KVM VCPU features may be probed, ++enabled, and disabled in the same way as other CPU features. Below is ++the list of KVM VCPU features and their descriptions. ++ ++ kvm-no-adjvtime By default kvm-no-adjvtime is disabled. This ++ means that by default the virtual time ++ adjustment is enabled (vtime is *not not* ++ adjusted). 
++ ++ When virtual time adjustment is enabled each ++ time the VM transitions back to running state ++ the VCPU's virtual counter is updated to ensure ++ stopped time is not counted. This avoids time ++ jumps surprising guest OSes and applications, ++ as long as they use the virtual counter for ++ timekeeping. However it has the side effect of ++ the virtual and physical counters diverging. ++ All timekeeping based on the virtual counter ++ will appear to lag behind any timekeeping that ++ does not subtract VM stopped time. The guest ++ may resynchronize its virtual counter with ++ other time sources as needed. ++ ++ Enable kvm-no-adjvtime to disable virtual time ++ adjustment, also restoring the legacy (pre-5.0) ++ behavior. ++ + SVE CPU Properties + ================== + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index e108391..d30d38c 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -1707,6 +1707,11 @@ static void machvirt_init(MachineState *machine) + } + } + ++ if (vmc->kvm_no_adjvtime && ++ object_property_find(cpuobj, "kvm-no-adjvtime", NULL)) { ++ object_property_set_bool(cpuobj, true, "kvm-no-adjvtime", NULL); ++ } ++ + if (vmc->no_pmu && object_property_find(cpuobj, "pmu", NULL)) { + object_property_set_bool(cpuobj, false, "pmu", NULL); + } +diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h +index 53fdf16..77828ce 100644 +--- a/include/hw/arm/virt.h ++++ b/include/hw/arm/virt.h +@@ -109,6 +109,7 @@ typedef struct { + bool smbios_old_sys_ver; + bool no_highmem_ecam; + bool no_ged; /* Machines < 4.2 has no support for ACPI GED device */ ++ bool kvm_no_adjvtime; + } VirtMachineClass; + + typedef struct { +diff --git a/target/arm/cpu.c b/target/arm/cpu.c +index 3788fc3..e46efe9 100644 +--- a/target/arm/cpu.c ++++ b/target/arm/cpu.c +@@ -2482,6 +2482,7 @@ static void arm_max_initfn(Object *obj) + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); ++ kvm_arm_add_vcpu_properties(obj); + } else { + cortex_a15_initfn(obj); + +@@ 
-2673,6 +2674,7 @@ static void arm_host_initfn(Object *obj) + if (arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) { + aarch64_add_sve_properties(obj); + } ++ kvm_arm_add_vcpu_properties(obj); + arm_cpu_post_init(obj); + } + +diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c +index a39d6fc..3cd416d 100644 +--- a/target/arm/cpu64.c ++++ b/target/arm/cpu64.c +@@ -605,6 +605,7 @@ static void aarch64_max_initfn(Object *obj) + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); ++ kvm_arm_add_vcpu_properties(obj); + } else { + uint64_t t; + uint32_t u; +diff --git a/target/arm/kvm.c b/target/arm/kvm.c +index 26d7f8b..4be9497 100644 +--- a/target/arm/kvm.c ++++ b/target/arm/kvm.c +@@ -17,6 +17,8 @@ + #include "qemu/timer.h" + #include "qemu/error-report.h" + #include "qemu/main-loop.h" ++#include "qom/object.h" ++#include "qapi/error.h" + #include "sysemu/sysemu.h" + #include "sysemu/kvm.h" + #include "sysemu/kvm_int.h" +@@ -179,6 +181,32 @@ void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu) + env->features = arm_host_cpu_features.features; + } + ++static bool kvm_no_adjvtime_get(Object *obj, Error **errp) ++{ ++ return !ARM_CPU(obj)->kvm_adjvtime; ++} ++ ++static void kvm_no_adjvtime_set(Object *obj, bool value, Error **errp) ++{ ++ ARM_CPU(obj)->kvm_adjvtime = !value; ++} ++ ++/* KVM VCPU properties should be prefixed with "kvm-". */ ++void kvm_arm_add_vcpu_properties(Object *obj) ++{ ++ if (!kvm_enabled()) { ++ return; ++ } ++ ++ ARM_CPU(obj)->kvm_adjvtime = true; ++ object_property_add_bool(obj, "kvm-no-adjvtime", kvm_no_adjvtime_get, ++ kvm_no_adjvtime_set, &error_abort); ++ object_property_set_description(obj, "kvm-no-adjvtime", ++ "Set on to disable the adjustment of " ++ "the virtual counter. 
VM stopped time " ++ "will be counted.", &error_abort); ++} ++ + bool kvm_arm_pmu_supported(CPUState *cpu) + { + KVMState *s = KVM_STATE(current_machine->accelerator); +diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h +index 01a9a18..ae9e075 100644 +--- a/target/arm/kvm_arm.h ++++ b/target/arm/kvm_arm.h +@@ -256,6 +256,15 @@ void kvm_arm_sve_get_vls(CPUState *cs, unsigned long *map); + void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu); + + /** ++ * kvm_arm_add_vcpu_properties: ++ * @obj: The CPU object to add the properties to ++ * ++ * Add all KVM specific CPU properties to the CPU object. These ++ * are the CPU properties with "kvm-" prefixed names. ++ */ ++void kvm_arm_add_vcpu_properties(Object *obj); ++ ++/** + * kvm_arm_aarch32_supported: + * @cs: CPUState + * +@@ -345,6 +354,8 @@ static inline void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu) + cpu->host_cpu_probe_failed = true; + } + ++static inline void kvm_arm_add_vcpu_properties(Object *obj) {} ++ + static inline bool kvm_arm_aarch32_supported(CPUState *cs) + { + return false; +diff --git a/target/arm/monitor.c b/target/arm/monitor.c +index fa054f8..9725dff 100644 +--- a/target/arm/monitor.c ++++ b/target/arm/monitor.c +@@ -103,6 +103,7 @@ static const char *cpu_model_advertised_features[] = { + "sve128", "sve256", "sve384", "sve512", + "sve640", "sve768", "sve896", "sve1024", "sve1152", "sve1280", + "sve1408", "sve1536", "sve1664", "sve1792", "sve1920", "sve2048", ++ "kvm-no-adjvtime", + NULL + }; + +diff --git a/tests/arm-cpu-features.c b/tests/arm-cpu-features.c +index 89285ca..ba1a6fe 100644 +--- a/tests/arm-cpu-features.c ++++ b/tests/arm-cpu-features.c +@@ -428,6 +428,8 @@ static void test_query_cpu_model_expansion(const void *data) + assert_has_feature_enabled(qts, "cortex-a15", "pmu"); + assert_has_not_feature(qts, "cortex-a15", "aarch64"); + ++ assert_has_not_feature(qts, "max", "kvm-no-adjvtime"); ++ + if (g_str_equal(qtest_get_arch(), "aarch64")) { + 
assert_has_feature_enabled(qts, "max", "aarch64"); + assert_has_feature_enabled(qts, "max", "sve"); +@@ -462,6 +464,8 @@ static void test_query_cpu_model_expansion_kvm(const void *data) + return; + } + ++ assert_has_feature_disabled(qts, "host", "kvm-no-adjvtime"); ++ + if (g_str_equal(qtest_get_arch(), "aarch64")) { + bool kvm_supports_sve; + char max_name[8], name[8]; +-- +1.8.3.1 + diff --git a/kvm-target-arm-kvm-Implement-virtual-time-adjustment.patch b/kvm-target-arm-kvm-Implement-virtual-time-adjustment.patch new file mode 100755 index 0000000..3396a32 --- /dev/null +++ b/kvm-target-arm-kvm-Implement-virtual-time-adjustment.patch @@ -0,0 +1,330 @@ +From 5388ea3fc0737d1a659256ff3663057bef484c19 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:13 +0000 +Subject: [PATCH 11/15] target/arm/kvm: Implement virtual time adjustment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-5-drjones@redhat.com> +Patchwork-id: 93622 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/5] target/arm/kvm: Implement virtual time adjustment +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:06 +0000 + + target/arm/kvm: Implement virtual time adjustment + + When a VM is stopped (such as when it's paused) guest virtual time + should stop counting. Otherwise, when the VM is resumed it will + experience time jumps and its kernel may report soft lockups. Not + counting virtual time while the VM is stopped has the side effect + of making the guest's time appear to lag when compared with real + time, and even with time derived from the physical counter. 
For + this reason, this change, which is enabled by default, comes with + a KVM CPU feature allowing it to be disabled, restoring legacy + behavior. + + This patch only provides the implementation of the virtual time + adjustment. A subsequent patch will provide the CPU property + allowing the change to be enabled and disabled. + + Reported-by: Bijan Mottahedeh + Signed-off-by: Andrew Jones + Message-id: 20200120101023.16030-6-drjones@redhat.com + Reviewed-by: Peter Maydell + Signed-off-by: Peter Maydell + +(cherry picked from commit e5ac4200b4cddf44df9adbef677af0d1f1c579c6) +Signed-off-by: Danilo C. L. de Paula +--- + target/arm/cpu.h | 7 ++++ + target/arm/kvm.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + target/arm/kvm32.c | 3 ++ + target/arm/kvm64.c | 3 ++ + target/arm/kvm_arm.h | 38 ++++++++++++++++++++++ + target/arm/machine.c | 7 ++++ + 6 files changed, 150 insertions(+) + +diff --git a/target/arm/cpu.h b/target/arm/cpu.h +index 82dd3cc..fbd8ea0 100644 +--- a/target/arm/cpu.h ++++ b/target/arm/cpu.h +@@ -821,6 +821,13 @@ struct ARMCPU { + /* KVM init features for this CPU */ + uint32_t kvm_init_features[7]; + ++ /* KVM CPU state */ ++ ++ /* KVM virtual time adjustment */ ++ bool kvm_adjvtime; ++ bool kvm_vtime_dirty; ++ uint64_t kvm_vtime; ++ + /* Uniprocessor system with MP extensions */ + bool mp_is_up; + +diff --git a/target/arm/kvm.c b/target/arm/kvm.c +index 5b82cef..26d7f8b 100644 +--- a/target/arm/kvm.c ++++ b/target/arm/kvm.c +@@ -359,6 +359,22 @@ static int compare_u64(const void *a, const void *b) + return 0; + } + ++/* ++ * cpreg_values are sorted in ascending order by KVM register ID ++ * (see kvm_arm_init_cpreg_list). This allows us to cheaply find ++ * the storage for a KVM register by ID with a binary search. 
++ */ ++static uint64_t *kvm_arm_get_cpreg_ptr(ARMCPU *cpu, uint64_t regidx) ++{ ++ uint64_t *res; ++ ++ res = bsearch(&regidx, cpu->cpreg_indexes, cpu->cpreg_array_len, ++ sizeof(uint64_t), compare_u64); ++ assert(res); ++ ++ return &cpu->cpreg_values[res - cpu->cpreg_indexes]; ++} ++ + /* Initialize the ARMCPU cpreg list according to the kernel's + * definition of what CPU registers it knows about (and throw away + * the previous TCG-created cpreg list). +@@ -512,6 +528,23 @@ bool write_list_to_kvmstate(ARMCPU *cpu, int level) + return ok; + } + ++void kvm_arm_cpu_pre_save(ARMCPU *cpu) ++{ ++ /* KVM virtual time adjustment */ ++ if (cpu->kvm_vtime_dirty) { ++ *kvm_arm_get_cpreg_ptr(cpu, KVM_REG_ARM_TIMER_CNT) = cpu->kvm_vtime; ++ } ++} ++ ++void kvm_arm_cpu_post_load(ARMCPU *cpu) ++{ ++ /* KVM virtual time adjustment */ ++ if (cpu->kvm_adjvtime) { ++ cpu->kvm_vtime = *kvm_arm_get_cpreg_ptr(cpu, KVM_REG_ARM_TIMER_CNT); ++ cpu->kvm_vtime_dirty = true; ++ } ++} ++ + void kvm_arm_reset_vcpu(ARMCPU *cpu) + { + int ret; +@@ -579,6 +612,50 @@ int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu) + return 0; + } + ++void kvm_arm_get_virtual_time(CPUState *cs) ++{ ++ ARMCPU *cpu = ARM_CPU(cs); ++ struct kvm_one_reg reg = { ++ .id = KVM_REG_ARM_TIMER_CNT, ++ .addr = (uintptr_t)&cpu->kvm_vtime, ++ }; ++ int ret; ++ ++ if (cpu->kvm_vtime_dirty) { ++ return; ++ } ++ ++ ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg); ++ if (ret) { ++ error_report("Failed to get KVM_REG_ARM_TIMER_CNT"); ++ abort(); ++ } ++ ++ cpu->kvm_vtime_dirty = true; ++} ++ ++void kvm_arm_put_virtual_time(CPUState *cs) ++{ ++ ARMCPU *cpu = ARM_CPU(cs); ++ struct kvm_one_reg reg = { ++ .id = KVM_REG_ARM_TIMER_CNT, ++ .addr = (uintptr_t)&cpu->kvm_vtime, ++ }; ++ int ret; ++ ++ if (!cpu->kvm_vtime_dirty) { ++ return; ++ } ++ ++ ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg); ++ if (ret) { ++ error_report("Failed to set KVM_REG_ARM_TIMER_CNT"); ++ abort(); ++ } ++ ++ cpu->kvm_vtime_dirty = false; ++} ++ + int
kvm_put_vcpu_events(ARMCPU *cpu) + { + CPUARMState *env = &cpu->env; +@@ -690,6 +767,21 @@ MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) + return MEMTXATTRS_UNSPECIFIED; + } + ++void kvm_arm_vm_state_change(void *opaque, int running, RunState state) ++{ ++ CPUState *cs = opaque; ++ ARMCPU *cpu = ARM_CPU(cs); ++ ++ if (running) { ++ if (cpu->kvm_adjvtime) { ++ kvm_arm_put_virtual_time(cs); ++ } ++ } else { ++ if (cpu->kvm_adjvtime) { ++ kvm_arm_get_virtual_time(cs); ++ } ++ } ++} + + int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) + { +diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c +index 32bf8d6..3a8b437 100644 +--- a/target/arm/kvm32.c ++++ b/target/arm/kvm32.c +@@ -16,6 +16,7 @@ + #include "qemu-common.h" + #include "cpu.h" + #include "qemu/timer.h" ++#include "sysemu/runstate.h" + #include "sysemu/kvm.h" + #include "kvm_arm.h" + #include "internals.h" +@@ -198,6 +199,8 @@ int kvm_arch_init_vcpu(CPUState *cs) + return -EINVAL; + } + ++ qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs); ++ + /* Determine init features for this CPU */ + memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); + if (cpu->start_powered_off) { +diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c +index 666a81a..d368189 100644 +--- a/target/arm/kvm64.c ++++ b/target/arm/kvm64.c +@@ -23,6 +23,7 @@ + #include "qemu/host-utils.h" + #include "qemu/main-loop.h" + #include "exec/gdbstub.h" ++#include "sysemu/runstate.h" + #include "sysemu/kvm.h" + #include "sysemu/kvm_int.h" + #include "kvm_arm.h" +@@ -735,6 +736,8 @@ int kvm_arch_init_vcpu(CPUState *cs) + return -EINVAL; + } + ++ qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs); ++ + /* Determine init features for this CPU */ + memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); + if (cpu->start_powered_off) { +diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h +index b48a9c9..01a9a18 100644 +--- a/target/arm/kvm_arm.h ++++ b/target/arm/kvm_arm.h 
+@@ -128,6 +128,23 @@ bool write_list_to_kvmstate(ARMCPU *cpu, int level); + bool write_kvmstate_to_list(ARMCPU *cpu); + + /** ++ * kvm_arm_cpu_pre_save: ++ * @cpu: ARMCPU ++ * ++ * Called after write_kvmstate_to_list() from cpu_pre_save() to update ++ * the cpreg list with KVM CPU state. ++ */ ++void kvm_arm_cpu_pre_save(ARMCPU *cpu); ++ ++/** ++ * kvm_arm_cpu_post_load: ++ * @cpu: ARMCPU ++ * ++ * Called from cpu_post_load() to update KVM CPU state from the cpreg list. ++ */ ++void kvm_arm_cpu_post_load(ARMCPU *cpu); ++ ++/** + * kvm_arm_reset_vcpu: + * @cpu: ARMCPU + * +@@ -292,6 +309,24 @@ int kvm_arm_sync_mpstate_to_kvm(ARMCPU *cpu); + */ + int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu); + ++/** ++ * kvm_arm_get_virtual_time: ++ * @cs: CPUState ++ * ++ * Gets the VCPU's virtual counter and stores it in the KVM CPU state. ++ */ ++void kvm_arm_get_virtual_time(CPUState *cs); ++ ++/** ++ * kvm_arm_put_virtual_time: ++ * @cs: CPUState ++ * ++ * Sets the VCPU's virtual counter to the value stored in the KVM CPU state. 
++ */ ++void kvm_arm_put_virtual_time(CPUState *cs); ++ ++void kvm_arm_vm_state_change(void *opaque, int running, RunState state); ++ + int kvm_arm_vgic_probe(void); + + void kvm_arm_pmu_set_irq(CPUState *cs, int irq); +@@ -339,6 +374,9 @@ static inline void kvm_arm_pmu_set_irq(CPUState *cs, int irq) {} + static inline void kvm_arm_pmu_init(CPUState *cs) {} + + static inline void kvm_arm_sve_get_vls(CPUState *cs, unsigned long *map) {} ++ ++static inline void kvm_arm_get_virtual_time(CPUState *cs) {} ++static inline void kvm_arm_put_virtual_time(CPUState *cs) {} + #endif + + static inline const char *gic_class_name(void) +diff --git a/target/arm/machine.c b/target/arm/machine.c +index eb28b23..241890a 100644 +--- a/target/arm/machine.c ++++ b/target/arm/machine.c +@@ -642,6 +642,12 @@ static int cpu_pre_save(void *opaque) + /* This should never fail */ + abort(); + } ++ ++ /* ++ * kvm_arm_cpu_pre_save() must be called after ++ * write_kvmstate_to_list() ++ */ ++ kvm_arm_cpu_pre_save(cpu); + } else { + if (!write_cpustate_to_list(cpu, false)) { + /* This should never fail. */ +@@ -744,6 +750,7 @@ static int cpu_post_load(void *opaque, int version_id) + * we're using it. 
+ */ + write_list_to_cpustate(cpu); ++ kvm_arm_cpu_post_load(cpu); + } else { + if (!write_list_to_cpustate(cpu)) { + return -1; +-- +1.8.3.1 + diff --git a/kvm-target-arm-kvm-trivial-Clean-up-header-documentation.patch b/kvm-target-arm-kvm-trivial-Clean-up-header-documentation.patch new file mode 100755 index 0000000..8cdc867 --- /dev/null +++ b/kvm-target-arm-kvm-trivial-Clean-up-header-documentation.patch @@ -0,0 +1,197 @@ +From 11cb9cb7b1b56d5c9723e9c50bc2903281893bcc Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:10 +0000 +Subject: [PATCH 08/15] target/arm/kvm: trivial: Clean up header documentation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-2-drjones@redhat.com> +Patchwork-id: 93625 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/5] target/arm/kvm: trivial: Clean up header documentation +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:05 +0000 + + target/arm/kvm: trivial: Clean up header documentation + + Signed-off-by: Andrew Jones + Message-id: 20200120101023.16030-2-drjones@redhat.com + Reviewed-by: Peter Maydell + Signed-off-by: Peter Maydell + +(cherry picked from commit d1ebbc9d16297b54b153ee33abe05eb4f1df0c66) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/arm/kvm_arm.h | 46 +++++++++++++++++++++++++++------------------- + 1 file changed, 27 insertions(+), 19 deletions(-) + +diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h +index 8e14d40..b48a9c9 100644 +--- a/target/arm/kvm_arm.h ++++ b/target/arm/kvm_arm.h +@@ -28,9 +28,9 @@ + int kvm_arm_vcpu_init(CPUState *cs); + + /** +- * kvm_arm_vcpu_finalize ++ * kvm_arm_vcpu_finalize: + * @cs: CPUState +- * @feature: int ++ * @feature: feature to finalize + * + * Finalizes the configuration of the specified VCPU feature by + * invoking the KVM_ARM_VCPU_FINALIZE ioctl. Features requiring +@@ -75,8 +75,8 @@ void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group, + int kvm_arm_init_cpreg_list(ARMCPU *cpu); + + /** +- * kvm_arm_reg_syncs_via_cpreg_list +- * regidx: KVM register index ++ * kvm_arm_reg_syncs_via_cpreg_list: ++ * @regidx: KVM register index + * + * Return true if this KVM register should be synchronized via the + * cpreg list of arbitrary system registers, false if it is synchronized +@@ -85,8 +85,8 @@ int kvm_arm_init_cpreg_list(ARMCPU *cpu); + bool kvm_arm_reg_syncs_via_cpreg_list(uint64_t regidx); + + /** +- * kvm_arm_cpreg_level +- * regidx: KVM register index ++ * kvm_arm_cpreg_level: ++ * @regidx: KVM register index + * + * Return the level of this coprocessor/system register. Return value is + * either KVM_PUT_RUNTIME_STATE, KVM_PUT_RESET_STATE, or KVM_PUT_FULL_STATE. +@@ -148,6 +148,8 @@ void kvm_arm_init_serror_injection(CPUState *cs); + * @cpu: ARMCPU + * + * Get VCPU related state from kvm. ++ * ++ * Returns: 0 if success else < 0 error code + */ + int kvm_get_vcpu_events(ARMCPU *cpu); + +@@ -156,6 +158,8 @@ int kvm_get_vcpu_events(ARMCPU *cpu); + * @cpu: ARMCPU + * + * Put VCPU related state to kvm. 
++ * ++ * Returns: 0 if success else < 0 error code + */ + int kvm_put_vcpu_events(ARMCPU *cpu); + +@@ -205,10 +209,12 @@ typedef struct ARMHostCPUFeatures { + + /** + * kvm_arm_get_host_cpu_features: +- * @ahcc: ARMHostCPUClass to fill in ++ * @ahcf: ARMHostCPUClass to fill in + * + * Probe the capabilities of the host kernel's preferred CPU and fill + * in the ARMHostCPUClass struct accordingly. ++ * ++ * Returns true on success and false otherwise. + */ + bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf); + +@@ -242,7 +248,7 @@ void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu); + bool kvm_arm_aarch32_supported(CPUState *cs); + + /** +- * bool kvm_arm_pmu_supported: ++ * kvm_arm_pmu_supported: + * @cs: CPUState + * + * Returns: true if the KVM VCPU can enable its PMU +@@ -251,7 +257,7 @@ bool kvm_arm_aarch32_supported(CPUState *cs); + bool kvm_arm_pmu_supported(CPUState *cs); + + /** +- * bool kvm_arm_sve_supported: ++ * kvm_arm_sve_supported: + * @cs: CPUState + * + * Returns true if the KVM VCPU can enable SVE and false otherwise. +@@ -259,26 +265,30 @@ bool kvm_arm_pmu_supported(CPUState *cs); + bool kvm_arm_sve_supported(CPUState *cs); + + /** +- * kvm_arm_get_max_vm_ipa_size - Returns the number of bits in the +- * IPA address space supported by KVM +- * ++ * kvm_arm_get_max_vm_ipa_size: + * @ms: Machine state handle ++ * ++ * Returns the number of bits in the IPA address space supported by KVM + */ + int kvm_arm_get_max_vm_ipa_size(MachineState *ms); + + /** +- * kvm_arm_sync_mpstate_to_kvm ++ * kvm_arm_sync_mpstate_to_kvm: + * @cpu: ARMCPU + * + * If supported set the KVM MP_STATE based on QEMU's model. ++ * ++ * Returns 0 on success and -1 on failure. + */ + int kvm_arm_sync_mpstate_to_kvm(ARMCPU *cpu); + + /** +- * kvm_arm_sync_mpstate_to_qemu ++ * kvm_arm_sync_mpstate_to_qemu: + * @cpu: ARMCPU + * + * If supported get the MP_STATE from KVM and store in QEMU's model. ++ * ++ * Returns 0 on success and aborts on failure. 
+ */ + int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu); + +@@ -292,7 +302,8 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); + + static inline void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu) + { +- /* This should never actually be called in the "not KVM" case, ++ /* ++ * This should never actually be called in the "not KVM" case, + * but set up the fields to indicate an error anyway. + */ + cpu->kvm_target = QEMU_KVM_ARM_TARGET_NONE; +@@ -377,23 +388,20 @@ bool kvm_arm_handle_debug(CPUState *cs, struct kvm_debug_exit_arch *debug_exit); + * + * Return: TRUE if any hardware breakpoints in use. + */ +- + bool kvm_arm_hw_debug_active(CPUState *cs); + + /** + * kvm_arm_copy_hw_debug_data: +- * + * @ptr: kvm_guest_debug_arch structure + * + * Copy the architecture specific debug registers into the + * kvm_guest_debug ioctl structure. + */ + struct kvm_guest_debug_arch; +- + void kvm_arm_copy_hw_debug_data(struct kvm_guest_debug_arch *ptr); + + /** +- * its_class_name ++ * its_class_name: + * + * Return the ITS class name to use depending on whether KVM acceleration + * and KVM CAP_SIGNAL_MSI are supported +-- +1.8.3.1 + diff --git a/kvm-target-arm-kvm64-kvm64-cpus-have-timer-registers.patch b/kvm-target-arm-kvm64-kvm64-cpus-have-timer-registers.patch new file mode 100755 index 0000000..36c0f1a --- /dev/null +++ b/kvm-target-arm-kvm64-kvm64-cpus-have-timer-registers.patch @@ -0,0 +1,60 @@ +From 2740a84fe798ade5c1ce725d65cdaffb255da47c Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:11 +0000 +Subject: [PATCH 09/15] target/arm/kvm64: kvm64 cpus have timer registers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-3-drjones@redhat.com> +Patchwork-id: 93621 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/5] target/arm/kvm64: kvm64 cpus have timer registers +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: 
Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:06 +0000 + + target/arm/kvm64: kvm64 cpus have timer registers + + Add the missing GENERIC_TIMER feature to kvm64 cpus. + + We don't currently use these registers when KVM is enabled, but it's + probably best we add the feature flag for consistency and potential + future use. There's also precedent, as we add the PMU feature flag to + KVM enabled guests, even though we don't use those registers either. + + This change was originally posted as a hunk of a different, never + merged patch from Bijan Mottahedeh. + + Signed-off-by: Andrew Jones + Reviewed-by: Richard Henderson + Message-id: 20200120101023.16030-4-drjones@redhat.com + Signed-off-by: Peter Maydell + +(cherry picked from commit 65caa415487f4a6e265105446c6ef8f56bb0aa70) +Signed-off-by: Danilo C. L. de Paula +--- + target/arm/kvm64.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c +index e2da756..666a81a 100644 +--- a/target/arm/kvm64.c ++++ b/target/arm/kvm64.c +@@ -605,6 +605,7 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) + set_feature(&features, ARM_FEATURE_NEON); + set_feature(&features, ARM_FEATURE_AARCH64); + set_feature(&features, ARM_FEATURE_PMU); ++ set_feature(&features, ARM_FEATURE_GENERIC_TIMER); + + ahcf->features = features; + +-- +1.8.3.1 + diff --git a/kvm-target-arm-monitor-query-cpu-model-expansion-crashed.patch b/kvm-target-arm-monitor-query-cpu-model-expansion-crashed.patch new file mode 100755 index 0000000..55f328d --- /dev/null +++ b/kvm-target-arm-monitor-query-cpu-model-expansion-crashed.patch @@ -0,0 +1,81 @@ +From c82cf5c08617c947b34eb490d1714729103e3379 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Mon, 10 Feb 2020 17:33:57 +0000 +Subject: [PATCH 17/18] target/arm/monitor: query-cpu-model-expansion crashed + qemu when using machine type none +MIME-Version: 
1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200210173358.16896-2-drjones@redhat.com> +Patchwork-id: 93773 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] target/arm/monitor: query-cpu-model-expansion crashed qemu when using machine type none +Bugzilla: 1801320 +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan +RH-Acked-by: Philippe Mathieu-Daudé + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1801320 + +Author: Liang Yan +Date: Fri, 07 Feb 2020 14:04:21 +0000 + + target/arm/monitor: query-cpu-model-expansion crashed qemu when using machine type none + + Commit e19afd566781 mentioned that target-arm only supports queryable + cpu models 'max', 'host', and the current type when KVM is in use. + The logic works well until using machine type none. + + For machine type none, cpu_type will be null if cpu option is not + set by command line, strlen(cpu_type) will terminate process. + So We add a check above it. + + This won't affect i386 and s390x since they do not use current_cpu. + + Signed-off-by: Liang Yan + Message-id: 20200203134251.12986-1-lyan@suse.com + Reviewed-by: Andrew Jones + Tested-by: Andrew Jones + Signed-off-by: Peter Maydell + +(cherry picked from commit 0999a4ba8718aa96105b978d3567fc7e90244c7e) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/arm/monitor.c | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +diff --git a/target/arm/monitor.c b/target/arm/monitor.c +index 9725dff..c2dc790 100644 +--- a/target/arm/monitor.c ++++ b/target/arm/monitor.c +@@ -137,17 +137,20 @@ CpuModelExpansionInfo *qmp_query_cpu_model_expansion(CpuModelExpansionType type, + } + + if (kvm_enabled()) { +- const char *cpu_type = current_machine->cpu_type; +- int len = strlen(cpu_type) - strlen(ARM_CPU_TYPE_SUFFIX); + bool supported = false; + + if (!strcmp(model->name, "host") || !strcmp(model->name, "max")) { + /* These are kvmarm's recommended cpu types */ + supported = true; +- } else if (strlen(model->name) == len && +- !strncmp(model->name, cpu_type, len)) { +- /* KVM is enabled and we're using this type, so it works. */ +- supported = true; ++ } else if (current_machine->cpu_type) { ++ const char *cpu_type = current_machine->cpu_type; ++ int len = strlen(cpu_type) - strlen(ARM_CPU_TYPE_SUFFIX); ++ ++ if (strlen(model->name) == len && ++ !strncmp(model->name, cpu_type, len)) { ++ /* KVM is enabled and we're using this type, so it works. 
*/ ++ supported = true; ++ } + } + if (!supported) { + error_setg(errp, "We cannot guarantee the CPU type '%s' works " +-- +1.8.3.1 + diff --git a/kvm-target-i386-Add-ARCH_CAPABILITIES-related-bits-into-.patch b/kvm-target-i386-Add-ARCH_CAPABILITIES-related-bits-into-.patch new file mode 100755 index 0000000..ffb6ab7 --- /dev/null +++ b/kvm-target-i386-Add-ARCH_CAPABILITIES-related-bits-into-.patch @@ -0,0 +1,83 @@ +From 4c9201a83e3ff48d2a55e45a34eb27966a1e4ab0 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 5 Jun 2020 18:37:33 -0400 +Subject: [PATCH 3/3] target/i386: Add ARCH_CAPABILITIES related bits into + Icelake-Server CPU model +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: plai@redhat.com +Message-id: <20200605183733.8269-1-plai@redhat.com> +Patchwork-id: 97380 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH] target/i386: Add ARCH_CAPABILITIES related bits into Icelake-Server CPU model +Bugzilla: 1840342 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Bandan Das +RH-Acked-by: Danilo de Paula +RH-Acked-by: Eduardo Habkost + +From: Xiaoyao Li + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1840342 +Brew: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=28983822 +Branch: rhel-av-8.2.1 + +Tested on HOST: intel-whitley-09.khw1.lab.eng.bos.redhat.com + +1. qemu-kvm -cpu host … + VM guest does have arch_capabilities in cpuinfo/flags. + [Expected success] + +2. qemu-kvm -cpu Icelake-Server … + VM guest does NOT have arch_capabilities in cpuinfo/flags. + [Expected failure] + +3. qemu-kvm -cpu Icelake-Server-v3 … + VM guest does have arch_capabilities in cpuinfo/flags. + [Expected success] + +--- + +Current Icelake-Server CPU model lacks all the features enumerated by +MSR_IA32_ARCH_CAPABILITIES. + +Add them, so that guest of "Icelake-Server" can see all of them. 
+ +Signed-off-by: Xiaoyao Li +Message-Id: <20200316095605.12318-1-xiaoyao.li@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit d965dc35592d24c0c1519f1c566223c6277cb80e) +Signed-off-by: Paul Lai +Signed-off-by: Eduardo Lima (Etrunko) +--- + target/i386/cpu.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index b763adcdc5..7d7b016bb7 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -3496,6 +3496,19 @@ static X86CPUDefinition builtin_x86_defs[] = { + { /* end of list */ } + }, + }, ++ { ++ .version = 3, ++ .props = (PropValue[]) { ++ { "arch-capabilities", "on" }, ++ { "rdctl-no", "on" }, ++ { "ibrs-all", "on" }, ++ { "skip-l1dfl-vmentry", "on" }, ++ { "mds-no", "on" }, ++ { "pschange-mc-no", "on" }, ++ { "taa-no", "on" }, ++ { /* end of list */ } ++ }, ++ }, + { /* end of list */ } + } + }, +-- +2.27.0 + diff --git a/kvm-target-i386-Add-missed-features-to-Cooperlake-CPU-mo.patch b/kvm-target-i386-Add-missed-features-to-Cooperlake-CPU-mo.patch new file mode 100755 index 0000000..ef95ccf --- /dev/null +++ b/kvm-target-i386-Add-missed-features-to-Cooperlake-CPU-mo.patch @@ -0,0 +1,103 @@ +From 1ffeb321151b3878bcbb2229639456c0677305f5 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 15 May 2020 18:02:43 +0100 +Subject: [PATCH 17/17] target/i386: Add missed features to Cooperlake CPU + model + +RH-Author: plai@redhat.com +Message-id: <20200515180243.17488-5-plai@redhat.com> +Patchwork-id: 96611 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 4/4] target/i386: Add missed features to Cooperlake CPU model +Bugzilla: 1769912 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Xiaoyao Li + +It lacks VMX features and two security feature bits (disclosed recently) in +MSR_IA32_ARCH_CAPABILITIES in current Cooperlake CPU model, so add them. 
+ +Fixes: 22a866b6166d ("i386: Add new CPU model Cooperlake") +Signed-off-by: Xiaoyao Li +Message-Id: <20191225063018.20038-3-xiaoyao.li@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 2dea9d9ca4ea7e9afe83d0b4153b21a16987e866) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 50 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 996a74f..b763adc 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -3202,7 +3202,8 @@ static X86CPUDefinition builtin_x86_defs[] = { + CPUID_7_0_EDX_SPEC_CTRL_SSBD | CPUID_7_0_EDX_ARCH_CAPABILITIES, + .features[FEAT_ARCH_CAPABILITIES] = + MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | +- MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO, ++ MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO | ++ MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_TAA_NO, + .features[FEAT_7_1_EAX] = + CPUID_7_1_EAX_AVX512_BF16, + /* +@@ -3217,6 +3218,54 @@ static X86CPUDefinition builtin_x86_defs[] = { + CPUID_XSAVE_XGETBV1, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, ++ /* Missing: Mode-based execute control (XS/XU), processor tracing, TSC scaling */ ++ .features[FEAT_VMX_BASIC] = MSR_VMX_BASIC_INS_OUTS | ++ MSR_VMX_BASIC_TRUE_CTLS, ++ .features[FEAT_VMX_ENTRY_CTLS] = VMX_VM_ENTRY_IA32E_MODE | ++ VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VMX_VM_ENTRY_LOAD_IA32_PAT | ++ VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS | VMX_VM_ENTRY_LOAD_IA32_EFER, ++ .features[FEAT_VMX_EPT_VPID_CAPS] = MSR_VMX_EPT_EXECONLY | ++ MSR_VMX_EPT_PAGE_WALK_LENGTH_4 | MSR_VMX_EPT_WB | MSR_VMX_EPT_2MB | ++ MSR_VMX_EPT_1GB | MSR_VMX_EPT_INVEPT | ++ MSR_VMX_EPT_INVEPT_SINGLE_CONTEXT | MSR_VMX_EPT_INVEPT_ALL_CONTEXT | ++ MSR_VMX_EPT_INVVPID | MSR_VMX_EPT_INVVPID_SINGLE_ADDR | ++ MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT | MSR_VMX_EPT_INVVPID_ALL_CONTEXT | ++ MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT_NOGLOBALS | MSR_VMX_EPT_AD_BITS, ++ 
.features[FEAT_VMX_EXIT_CTLS] = ++ VMX_VM_EXIT_ACK_INTR_ON_EXIT | VMX_VM_EXIT_SAVE_DEBUG_CONTROLS | ++ VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | ++ VMX_VM_EXIT_LOAD_IA32_PAT | VMX_VM_EXIT_LOAD_IA32_EFER | ++ VMX_VM_EXIT_SAVE_IA32_PAT | VMX_VM_EXIT_SAVE_IA32_EFER | ++ VMX_VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, ++ .features[FEAT_VMX_MISC] = MSR_VMX_MISC_ACTIVITY_HLT | ++ MSR_VMX_MISC_STORE_LMA | MSR_VMX_MISC_VMWRITE_VMEXIT, ++ .features[FEAT_VMX_PINBASED_CTLS] = VMX_PIN_BASED_EXT_INTR_MASK | ++ VMX_PIN_BASED_NMI_EXITING | VMX_PIN_BASED_VIRTUAL_NMIS | ++ VMX_PIN_BASED_VMX_PREEMPTION_TIMER | VMX_PIN_BASED_POSTED_INTR, ++ .features[FEAT_VMX_PROCBASED_CTLS] = VMX_CPU_BASED_VIRTUAL_INTR_PENDING | ++ VMX_CPU_BASED_USE_TSC_OFFSETING | VMX_CPU_BASED_HLT_EXITING | ++ VMX_CPU_BASED_INVLPG_EXITING | VMX_CPU_BASED_MWAIT_EXITING | ++ VMX_CPU_BASED_RDPMC_EXITING | VMX_CPU_BASED_RDTSC_EXITING | ++ VMX_CPU_BASED_CR8_LOAD_EXITING | VMX_CPU_BASED_CR8_STORE_EXITING | ++ VMX_CPU_BASED_TPR_SHADOW | VMX_CPU_BASED_MOV_DR_EXITING | ++ VMX_CPU_BASED_UNCOND_IO_EXITING | VMX_CPU_BASED_USE_IO_BITMAPS | ++ VMX_CPU_BASED_MONITOR_EXITING | VMX_CPU_BASED_PAUSE_EXITING | ++ VMX_CPU_BASED_VIRTUAL_NMI_PENDING | VMX_CPU_BASED_USE_MSR_BITMAPS | ++ VMX_CPU_BASED_CR3_LOAD_EXITING | VMX_CPU_BASED_CR3_STORE_EXITING | ++ VMX_CPU_BASED_MONITOR_TRAP_FLAG | ++ VMX_CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, ++ .features[FEAT_VMX_SECONDARY_CTLS] = ++ VMX_SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | ++ VMX_SECONDARY_EXEC_WBINVD_EXITING | VMX_SECONDARY_EXEC_ENABLE_EPT | ++ VMX_SECONDARY_EXEC_DESC | VMX_SECONDARY_EXEC_RDTSCP | ++ VMX_SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | ++ VMX_SECONDARY_EXEC_ENABLE_VPID | VMX_SECONDARY_EXEC_UNRESTRICTED_GUEST | ++ VMX_SECONDARY_EXEC_APIC_REGISTER_VIRT | ++ VMX_SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | ++ VMX_SECONDARY_EXEC_RDRAND_EXITING | VMX_SECONDARY_EXEC_ENABLE_INVPCID | ++ VMX_SECONDARY_EXEC_ENABLE_VMFUNC | VMX_SECONDARY_EXEC_SHADOW_VMCS | ++ VMX_SECONDARY_EXEC_RDSEED_EXITING | 
VMX_SECONDARY_EXEC_ENABLE_PML, ++ .features[FEAT_VMX_VMFUNC] = MSR_VMX_VMFUNC_EPT_SWITCHING, + .xlevel = 0x80000008, + .model_id = "Intel Xeon Processor (Cooperlake)", + }, +-- +1.8.3.1 + diff --git a/kvm-target-i386-Add-new-bit-definitions-of-MSR_IA32_ARCH.patch b/kvm-target-i386-Add-new-bit-definitions-of-MSR_IA32_ARCH.patch new file mode 100755 index 0000000..ad2dd77 --- /dev/null +++ b/kvm-target-i386-Add-new-bit-definitions-of-MSR_IA32_ARCH.patch @@ -0,0 +1,62 @@ +From 6f0630299a3edbb8f5e5ac41eb9e1f1c363f1e3e Mon Sep 17 00:00:00 2001 +From: Danilo de Paula +Date: Tue, 9 Jun 2020 18:46:51 +0100 +Subject: [PATCH 15/17] target/i386: Add new bit definitions of + MSR_IA32_ARCH_CAPABILITIES + +RH-Author: Danilo de Paula +Message-id: <20200609184651.1328372-1-ddepaula@redhat.com> +Patchwork-id: 97489 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 5/4] target/i386: Add new bit definitions of MSR_IA32_ARCH_CAPABILITIES +Bugzilla: 1769912 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Eduardo Habkost + +From: Danilo de Paula + +redhat: builds with that series were failing. It complains about an undefined +MSR_ARCH_CAP_TAA_NO. + +Bits 6, 7 and 8 of MSR_IA32_ARCH_CAPABILITIES were recently disclosed +for some security issues. Add the definitions for them to be used by named +CPU models. + +Signed-off-by: Xiaoyao Li +Message-Id: <20191225063018.20038-2-xiaoyao.li@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 6c997b4adb300788d61d72e2b8bc67c03a584956) + +Signed-off-by: Paolo Bonzini +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/cpu.h | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index e77d101..7bfbf2a 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -836,12 +836,15 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_TOPOLOGY_LEVEL_DIE (5U << 8) + + /* MSR Feature Bits */ +-#define MSR_ARCH_CAP_RDCL_NO (1U << 0) +-#define MSR_ARCH_CAP_IBRS_ALL (1U << 1) +-#define MSR_ARCH_CAP_RSBA (1U << 2) ++#define MSR_ARCH_CAP_RDCL_NO (1U << 0) ++#define MSR_ARCH_CAP_IBRS_ALL (1U << 1) ++#define MSR_ARCH_CAP_RSBA (1U << 2) + #define MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY (1U << 3) +-#define MSR_ARCH_CAP_SSB_NO (1U << 4) +-#define MSR_ARCH_CAP_MDS_NO (1U << 5) ++#define MSR_ARCH_CAP_SSB_NO (1U << 4) ++#define MSR_ARCH_CAP_MDS_NO (1U << 5) ++#define MSR_ARCH_CAP_PSCHANGE_MC_NO (1U << 6) ++#define MSR_ARCH_CAP_TSX_CTRL_MSR (1U << 7) ++#define MSR_ARCH_CAP_TAA_NO (1U << 8) + + #define MSR_CORE_CAP_SPLIT_LOCK_DETECT (1U << 5) + +-- +1.8.3.1 + diff --git a/kvm-target-i386-add-a-ucode-rev-property.patch b/kvm-target-i386-add-a-ucode-rev-property.patch new file mode 100755 index 0000000..5c3c770 --- /dev/null +++ b/kvm-target-i386-add-a-ucode-rev-property.patch @@ -0,0 +1,125 @@ +From 4009f0bcc8004ce481015d088fe335a16b8d7ce1 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:12 +0000 +Subject: [PATCH 2/9] target/i386: add a ucode-rev property + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-3-pbonzini@redhat.com> +Patchwork-id: 93909 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/6] target/i386: add a ucode-rev property +Bugzilla: 1791648 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +Add the property and plumb it in TCG and HVF (the latter of which +tried to support returning a constant value but used the wrong MSR). 
+ +Signed-off-by: Paolo Bonzini +Message-Id: <1579544504-3616-3-git-send-email-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 4e45aff398cd1542c2a384a2a3b8600f23337d86) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 10 ++++++++++ + target/i386/cpu.h | 3 +++ + target/i386/hvf/x86_emu.c | 4 +--- + target/i386/misc_helper.c | 4 ++++ + 4 files changed, 18 insertions(+), 3 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 863192c..e505d3e 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6325,6 +6325,15 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) + } + } + ++ if (cpu->ucode_rev == 0) { ++ /* The default is the same as KVM's. */ ++ if (IS_AMD_CPU(env)) { ++ cpu->ucode_rev = 0x01000065; ++ } else { ++ cpu->ucode_rev = 0x100000000ULL; ++ } ++ } ++ + /* mwait extended info: needed for Core compatibility */ + /* We always wake on interrupt even if host does not have the capability */ + cpu->mwait.ecx |= CPUID_MWAIT_EMX | CPUID_MWAIT_IBE; +@@ -7008,6 +7017,7 @@ static Property x86_cpu_properties[] = { + DEFINE_PROP_UINT32("min-level", X86CPU, env.cpuid_min_level, 0), + DEFINE_PROP_UINT32("min-xlevel", X86CPU, env.cpuid_min_xlevel, 0), + DEFINE_PROP_UINT32("min-xlevel2", X86CPU, env.cpuid_min_xlevel2, 0), ++ DEFINE_PROP_UINT64("ucode-rev", X86CPU, ucode_rev, 0), + DEFINE_PROP_BOOL("full-cpuid-auto-level", X86CPU, full_cpuid_auto_level, true), + DEFINE_PROP_STRING("hv-vendor-id", X86CPU, hyperv_vendor_id), + DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true), +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index cde2a16..4441061 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -348,6 +348,7 @@ typedef enum X86Seg { + #define MSR_IA32_SPEC_CTRL 0x48 + #define MSR_VIRT_SSBD 0xc001011f + #define MSR_IA32_PRED_CMD 0x49 ++#define MSR_IA32_UCODE_REV 0x8b + #define MSR_IA32_CORE_CAPABILITY 0xcf + + #define MSR_IA32_ARCH_CAPABILITIES 0x10a +@@ -1621,6 
+1622,8 @@ struct X86CPU { + CPUNegativeOffsetState neg; + CPUX86State env; + ++ uint64_t ucode_rev; ++ + uint32_t hyperv_spinlock_attempts; + char *hyperv_vendor_id; + bool hyperv_synic_kvm_only; +diff --git a/target/i386/hvf/x86_emu.c b/target/i386/hvf/x86_emu.c +index 3df7672..92ab815 100644 +--- a/target/i386/hvf/x86_emu.c ++++ b/target/i386/hvf/x86_emu.c +@@ -664,8 +664,6 @@ static void exec_lods(struct CPUX86State *env, struct x86_decode *decode) + RIP(env) += decode->len; + } + +-#define MSR_IA32_UCODE_REV 0x00000017 +- + void simulate_rdmsr(struct CPUState *cpu) + { + X86CPU *x86_cpu = X86_CPU(cpu); +@@ -681,7 +679,7 @@ void simulate_rdmsr(struct CPUState *cpu) + val = cpu_get_apic_base(X86_CPU(cpu)->apic_state); + break; + case MSR_IA32_UCODE_REV: +- val = (0x100000000ULL << 32) | 0x100000000ULL; ++ val = x86_cpu->ucode_rev; + break; + case MSR_EFER: + val = rvmcs(cpu->hvf_fd, VMCS_GUEST_IA32_EFER); +diff --git a/target/i386/misc_helper.c b/target/i386/misc_helper.c +index 3eff688..aed16fe 100644 +--- a/target/i386/misc_helper.c ++++ b/target/i386/misc_helper.c +@@ -229,6 +229,7 @@ void helper_rdmsr(CPUX86State *env) + #else + void helper_wrmsr(CPUX86State *env) + { ++ X86CPU *x86_cpu = env_archcpu(env); + uint64_t val; + + cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 1, GETPC()); +@@ -371,6 +372,9 @@ void helper_wrmsr(CPUX86State *env) + env->msr_bndcfgs = val; + cpu_sync_bndcs_hflags(env); + break; ++ case MSR_IA32_UCODE_REV: ++ val = x86_cpu->ucode_rev; ++ break; + default: + if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL + && (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL + +-- +1.8.3.1 + diff --git a/kvm-target-i386-add-fast-short-REP-MOV-support.patch b/kvm-target-i386-add-fast-short-REP-MOV-support.patch new file mode 100755 index 0000000..51af7e7 --- /dev/null +++ b/kvm-target-i386-add-fast-short-REP-MOV-support.patch @@ -0,0 +1,59 @@ +From f33880c5f7a4e2cad25c22112da073273c6e2cfb Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Wed, 24 Feb 2021 11:30:35 -0500 +Subject: [PATCH 2/4] target/i386: add fast short REP MOV support + +RH-Author: Dr. David Alan Gilbert +Message-id: <20210224113037.15599-3-dgilbert@redhat.com> +Patchwork-id: 101201 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/4] target/i386: add fast short REP MOV support +Bugzilla: 1790620 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Peter Xu + +From: Chenyi Qiang + +For CPUs support fast short REP MOV[CPUID.(EAX=7,ECX=0):EDX(bit4)], e.g +Icelake and Tigerlake, expose it to the guest VM. + +Reviewed-by: Eduardo Habkost +Signed-off-by: Chenyi Qiang +Message-Id: <20200714084148.26690-2-chenyi.qiang@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 5cb287d2bd578dfe4897458793b4fce35bc4f744) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 2 +- + target/i386/cpu.h | 2 ++ + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 67dab94aa5..f6a9ed84b3 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1077,7 +1077,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { + NULL, NULL, "avx512-4vnniw", "avx512-4fmaps", +- NULL, NULL, NULL, NULL, ++ "fsrm", NULL, NULL, NULL, + "avx512-vp2intersect", NULL, "md-clear", NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL /* pconfig */, NULL, +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 8e2e52ed31..f5a4efcec6 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -770,6 +770,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_7_0_EDX_AVX512_4VNNIW (1U << 2) + /* AVX512 Multiply Accumulation Single Precision */ + #define CPUID_7_0_EDX_AVX512_4FMAPS (1U << 3) ++/* Fast Short Rep Mov */ ++#define CPUID_7_0_EDX_FSRM (1U << 4) + /* AVX512 Vector Pair Intersection to a Pair of Mask Registers */ + #define CPUID_7_0_EDX_AVX512_VP2INTERSECT (1U << 8) + /* 
Speculation Control */ +-- +2.27.0 + diff --git a/kvm-target-i386-check-for-availability-of-MSR_IA32_UCODE.patch b/kvm-target-i386-check-for-availability-of-MSR_IA32_UCODE.patch new file mode 100755 index 0000000..a80c9d3 --- /dev/null +++ b/kvm-target-i386-check-for-availability-of-MSR_IA32_UCODE.patch @@ -0,0 +1,72 @@ +From 27d7b085f2f568050d638b694ed2f51495db718c Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:15 +0000 +Subject: [PATCH 5/9] target/i386: check for availability of MSR_IA32_UCODE_REV + as an emulated MSR +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-6-pbonzini@redhat.com> +Patchwork-id: 93898 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 5/6] target/i386: check for availability of MSR_IA32_UCODE_REV as an emulated MSR +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +Even though MSR_IA32_UCODE_REV has been available long before Linux 5.6, +which added it to the emulated MSR list, a bug caused the microcode +version to revert to 0x100000000 on INIT. As a result, processors other +than the bootstrap processor would not see the host microcode revision; +some Windows versions complain loudly about this and crash with a +fairly explicit MICROCODE REVISION MISMATCH error. + +[If running 5.6 prereleases, the kernel fix "KVM: x86: do not reset + microcode version on INIT or RESET" should also be applied.] + +Reported-by: Alex Williamson +Message-id: <20200211175516.10716-1-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 6702514814c7e7b4cbf179624539b5f38c72740b) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/kvm.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index 6c61aef..99840ca 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -105,6 +105,7 @@ static bool has_msr_smi_count; + static bool has_msr_arch_capabs; + static bool has_msr_core_capabs; + static bool has_msr_vmx_vmfunc; ++static bool has_msr_ucode_rev; + + static uint32_t has_architectural_pmu_version; + static uint32_t num_architectural_pmu_gp_counters; +@@ -2056,6 +2057,9 @@ static int kvm_get_supported_msrs(KVMState *s) + case MSR_IA32_VMX_VMFUNC: + has_msr_vmx_vmfunc = true; + break; ++ case MSR_IA32_UCODE_REV: ++ has_msr_ucode_rev = true; ++ break; + } + } + } +@@ -2696,8 +2700,7 @@ static void kvm_init_msrs(X86CPU *cpu) + env->features[FEAT_CORE_CAPABILITY]); + } + +- if (kvm_arch_get_supported_msr_feature(kvm_state, +- MSR_IA32_UCODE_REV)) { ++ if (has_msr_ucode_rev) { + kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev); + } + +-- +1.8.3.1 + diff --git a/kvm-target-i386-do-not-set-unsupported-VMX-secondary-exe.patch b/kvm-target-i386-do-not-set-unsupported-VMX-secondary-exe.patch new file mode 100755 index 0000000..4c2362d --- /dev/null +++ b/kvm-target-i386-do-not-set-unsupported-VMX-secondary-exe.patch @@ -0,0 +1,112 @@ +From 77cdcccc49ba988e3b5bcb66decdee2e99fdcd72 Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Tue, 14 Apr 2020 15:00:36 +0100 +Subject: [PATCH] target/i386: do not set unsupported VMX secondary execution + controls + +RH-Author: Vitaly Kuznetsov +Message-id: <20200414150036.625732-2-vkuznets@redhat.com> +Patchwork-id: 94674 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] target/i386: do not set unsupported VMX secondary execution controls +Bugzilla: 1822682 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Paolo Bonzini + +Commit 048c95163b4 ("target/i386: work around KVM_GET_MSRS bug for +secondary execution controls") added a workaround for KVM pre-dating +commit 
6defc591846d ("KVM: nVMX: include conditional controls in /dev/kvm +KVM_GET_MSRS") which wasn't setting certain available controls. The +workaround uses generic CPUID feature bits to set missing VMX controls. + +It was found that in some cases it is possible to observe hosts which +have certain CPUID features but lack the corresponding VMX control. + +In particular, it was reported that Azure VMs have RDSEED but lack +VMX_SECONDARY_EXEC_RDSEED_EXITING; attempts to enable this feature +bit result in QEMU abort. + +Resolve the issue by not applying the workaround when we don't have +to. As there is no good way to find out if KVM has the fix itself, use +95c5c7c77c ("KVM: nVMX: list VMX MSRs in KVM_GET_MSR_INDEX_LIST") instead +as these [are supposed to] come together. + +Fixes: 048c95163b4 ("target/i386: work around KVM_GET_MSRS bug for secondary execution controls") +Suggested-by: Paolo Bonzini +Signed-off-by: Vitaly Kuznetsov +Message-Id: <20200331162752.1209928-1-vkuznets@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 4a910e1f6ab4155ec8b24c49b2585cc486916985) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/kvm.c | 41 ++++++++++++++++++++++++++--------------- + 1 file changed, 26 insertions(+), 15 deletions(-) + +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index 99840ca..fcc8f7d 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -106,6 +106,7 @@ static bool has_msr_arch_capabs; + static bool has_msr_core_capabs; + static bool has_msr_vmx_vmfunc; + static bool has_msr_ucode_rev; ++static bool has_msr_vmx_procbased_ctls2; + + static uint32_t has_architectural_pmu_version; + static uint32_t num_architectural_pmu_gp_counters; +@@ -490,21 +491,28 @@ uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index) + value = msr_data.entries[0].data; + switch (index) { + case MSR_IA32_VMX_PROCBASED_CTLS2: +- /* KVM forgot to add these bits for some time, do this ourselves. 
*/ +- if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) & CPUID_XSAVE_XSAVES) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32; +- } +- if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) & CPUID_EXT_RDRAND) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32; +- } +- if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & CPUID_7_0_EBX_INVPCID) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32; +- } +- if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & CPUID_7_0_EBX_RDSEED) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32; +- } +- if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) & CPUID_EXT2_RDTSCP) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32; ++ if (!has_msr_vmx_procbased_ctls2) { ++ /* KVM forgot to add these bits for some time, do this ourselves. */ ++ if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) & ++ CPUID_XSAVE_XSAVES) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32; ++ } ++ if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) & ++ CPUID_EXT_RDRAND) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32; ++ } ++ if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & ++ CPUID_7_0_EBX_INVPCID) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32; ++ } ++ if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & ++ CPUID_7_0_EBX_RDSEED) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32; ++ } ++ if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) & ++ CPUID_EXT2_RDTSCP) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32; ++ } + } + /* fall through */ + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: +@@ -2060,6 +2068,9 @@ static int kvm_get_supported_msrs(KVMState *s) + case MSR_IA32_UCODE_REV: + has_msr_ucode_rev = true; + break; ++ case MSR_IA32_VMX_PROCBASED_CTLS2: ++ has_msr_vmx_procbased_ctls2 = true; ++ break; + } + } + } +-- +1.8.3.1 + diff --git a/kvm-target-i386-enable-monitor-and-ucode-revision-with-c.patch 
b/kvm-target-i386-enable-monitor-and-ucode-revision-with-c.patch new file mode 100755 index 0000000..47438a3 --- /dev/null +++ b/kvm-target-i386-enable-monitor-and-ucode-revision-with-c.patch @@ -0,0 +1,49 @@ +From 7b71a7011437ebfa3bc7df9297e892b82293ec98 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:16 +0000 +Subject: [PATCH 6/9] target/i386: enable monitor and ucode revision with -cpu + max +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-7-pbonzini@redhat.com> +Patchwork-id: 93910 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 6/6] target/i386: enable monitor and ucode revision with -cpu max +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +These two features were incorrectly tied to host_cpuid_required rather than +cpu->max_features. As a result, -cpu max was not enabling either MONITOR +features or ucode revision. + +Signed-off-by: Paolo Bonzini +(cherry picked from commit be02cda3afde60d219786e23c3f8edb53aec8e17) + +[RHEL7: context, upstream uses g_autofree] + +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/cpu.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 5ac843d..1685a8c 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6317,7 +6317,9 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) + g_free(name); + goto out; + } ++ } + ++ if (cpu->max_features && accel_uses_host_cpuid()) { + if (enable_cpu_pm) { + host_cpuid(5, 0, &cpu->mwait.eax, &cpu->mwait.ebx, + &cpu->mwait.ecx, &cpu->mwait.edx); +-- +1.8.3.1 + diff --git a/kvm-target-i386-fix-TCG-UCODE_REV-access.patch b/kvm-target-i386-fix-TCG-UCODE_REV-access.patch new file mode 100755 index 0000000..c7ced8a --- /dev/null +++ b/kvm-target-i386-fix-TCG-UCODE_REV-access.patch @@ -0,0 +1,73 @@ +From 3d16f05359e6277da1f970f71aa9f76337d655dc Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:14 +0000 +Subject: [PATCH 4/9] target/i386: fix TCG UCODE_REV access +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-5-pbonzini@redhat.com> +Patchwork-id: 93904 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/6] target/i386: fix TCG UCODE_REV access +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +This was a very interesting semantic conflict that caused git to move +the MSR_IA32_UCODE_REV read to helper_wrmsr. Not a big deal, but +still should be fixed... + +Fixes: 4e45aff398 ("target/i386: add a ucode-rev property", 2020-01-24) +Message-id: <20200206171022.9289-1-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 9028c75c9d08be303ccc425bfe3d3b23d8f4cac7) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/misc_helper.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/target/i386/misc_helper.c b/target/i386/misc_helper.c +index aed16fe..7d61221 100644 +--- a/target/i386/misc_helper.c ++++ b/target/i386/misc_helper.c +@@ -229,7 +229,6 @@ void helper_rdmsr(CPUX86State *env) + #else + void helper_wrmsr(CPUX86State *env) + { +- X86CPU *x86_cpu = env_archcpu(env); + uint64_t val; + + cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 1, GETPC()); +@@ -372,9 +371,6 @@ void helper_wrmsr(CPUX86State *env) + env->msr_bndcfgs = val; + cpu_sync_bndcs_hflags(env); + break; +- case MSR_IA32_UCODE_REV: +- val = x86_cpu->ucode_rev; +- break; + default: + if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL + && (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL + +@@ -393,6 +389,7 @@ void helper_wrmsr(CPUX86State *env) + + void helper_rdmsr(CPUX86State *env) + { ++ X86CPU *x86_cpu = env_archcpu(env); + uint64_t val; + + cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 0, GETPC()); +@@ -526,6 +523,9 @@ void helper_rdmsr(CPUX86State *env) + case MSR_IA32_BNDCFGS: + val = env->msr_bndcfgs; + break; ++ case MSR_IA32_UCODE_REV: ++ val = x86_cpu->ucode_rev; ++ break; + default: + if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL + && (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL + +-- +1.8.3.1 + diff --git a/kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch b/kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch new file mode 100755 index 0000000..5118aed --- /dev/null +++ b/kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch @@ -0,0 +1,178 @@ +From eb0fc0ae2750a0462698d6d21ebb56a4249539f9 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:11 +0000 +Subject: [PATCH 1/9] target/i386: kvm: initialize feature MSRs very early +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-2-pbonzini@redhat.com> 
+Patchwork-id: 93899 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/6] target/i386: kvm: initialize feature MSRs very early +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +Some read-only MSRs affect the behavior of ioctls such as +KVM_SET_NESTED_STATE. We can initialize them once and for all +right after the CPU is realized, since they will never be modified +by the guest. + +Reported-by: Qingua Cheng +Cc: qemu-stable@nongnu.org +Signed-off-by: Paolo Bonzini +Message-Id: <1579544504-3616-2-git-send-email-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 420ae1fc51c99abfd03b1c590f55617edd2a2bed) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/kvm.c | 81 ++++++++++++++++++++++++++++++-------------------- + target/i386/kvm_i386.h | 1 + + 2 files changed, 49 insertions(+), 33 deletions(-) + +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index 86d9a1f..f41605b 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -67,6 +67,8 @@ + * 255 kvm_msr_entry structs */ + #define MSR_BUF_SIZE 4096 + ++static void kvm_init_msrs(X86CPU *cpu); ++ + const KVMCapabilityInfo kvm_arch_required_capabilities[] = { + KVM_CAP_INFO(SET_TSS_ADDR), + KVM_CAP_INFO(EXT_CPUID), +@@ -1842,6 +1844,8 @@ int kvm_arch_init_vcpu(CPUState *cs) + has_msr_tsc_aux = false; + } + ++ kvm_init_msrs(cpu); ++ + r = hyperv_init_vcpu(cpu); + if (r) { + goto fail; +@@ -2660,11 +2664,53 @@ static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f) + VMCS12_MAX_FIELD_INDEX << 1); + } + ++static int kvm_buf_set_msrs(X86CPU *cpu) ++{ ++ int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ if (ret < cpu->kvm_msr_buf->nmsrs) { ++ struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; ++ error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64, ++ (uint32_t)e->index, (uint64_t)e->data); ++ } ++ ++ assert(ret == 
cpu->kvm_msr_buf->nmsrs); ++ return 0; ++} ++ ++static void kvm_init_msrs(X86CPU *cpu) ++{ ++ CPUX86State *env = &cpu->env; ++ ++ kvm_msr_buf_reset(cpu); ++ if (has_msr_arch_capabs) { ++ kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, ++ env->features[FEAT_ARCH_CAPABILITIES]); ++ } ++ ++ if (has_msr_core_capabs) { ++ kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY, ++ env->features[FEAT_CORE_CAPABILITY]); ++ } ++ ++ /* ++ * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but ++ * all kernels with MSR features should have them. ++ */ ++ if (kvm_feature_msrs && cpu_has_vmx(env)) { ++ kvm_msr_entry_add_vmx(cpu, env->features); ++ } ++ ++ assert(kvm_buf_set_msrs(cpu) == 0); ++} ++ + static int kvm_put_msrs(X86CPU *cpu, int level) + { + CPUX86State *env = &cpu->env; + int i; +- int ret; + + kvm_msr_buf_reset(cpu); + +@@ -2722,17 +2768,6 @@ static int kvm_put_msrs(X86CPU *cpu, int level) + } + #endif + +- /* If host supports feature MSR, write down. */ +- if (has_msr_arch_capabs) { +- kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, +- env->features[FEAT_ARCH_CAPABILITIES]); +- } +- +- if (has_msr_core_capabs) { +- kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY, +- env->features[FEAT_CORE_CAPABILITY]); +- } +- + /* + * The following MSRs have side effects on the guest or are too heavy + * for normal writeback. Limit them to reset or full state updates. +@@ -2910,14 +2945,6 @@ static int kvm_put_msrs(X86CPU *cpu, int level) + + /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see + * kvm_put_msr_feature_control. */ +- +- /* +- * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but +- * all kernels with MSR features should have them. 
+- */ +- if (kvm_feature_msrs && cpu_has_vmx(env)) { +- kvm_msr_entry_add_vmx(cpu, env->features); +- } + } + + if (env->mcg_cap) { +@@ -2933,19 +2960,7 @@ static int kvm_put_msrs(X86CPU *cpu, int level) + } + } + +- ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); +- if (ret < 0) { +- return ret; +- } +- +- if (ret < cpu->kvm_msr_buf->nmsrs) { +- struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; +- error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64, +- (uint32_t)e->index, (uint64_t)e->data); +- } +- +- assert(ret == cpu->kvm_msr_buf->nmsrs); +- return 0; ++ return kvm_buf_set_msrs(cpu); + } + + +diff --git a/target/i386/kvm_i386.h b/target/i386/kvm_i386.h +index 06fe06b..d98c6f6 100644 +--- a/target/i386/kvm_i386.h ++++ b/target/i386/kvm_i386.h +@@ -66,4 +66,5 @@ bool kvm_enable_x2apic(void); + bool kvm_has_x2apic_api(void); + + bool kvm_hv_vpindex_settable(void); ++ + #endif +-- +1.8.3.1 + diff --git a/kvm-target-i386-kvm-initialize-microcode-revision-from-K.patch b/kvm-target-i386-kvm-initialize-microcode-revision-from-K.patch new file mode 100755 index 0000000..99b18fc --- /dev/null +++ b/kvm-target-i386-kvm-initialize-microcode-revision-from-K.patch @@ -0,0 +1,64 @@ +From 8f39b0c9523630efeb451e2298cf64b88cd2ac81 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:13 +0000 +Subject: [PATCH 3/9] target/i386: kvm: initialize microcode revision from KVM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-4-pbonzini@redhat.com> +Patchwork-id: 93897 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/6] target/i386: kvm: initialize microcode revision from KVM +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +KVM can return the host microcode revision as a feature MSR. +Use it as the default value for -cpu host. 
+ +Signed-off-by: Paolo Bonzini +Message-Id: <1579544504-3616-4-git-send-email-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 32c87d70ff55b96741f08c35108935cac6f40fe4) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 4 ++++ + target/i386/kvm.c | 5 +++++ + 2 files changed, 9 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index e505d3e..5ac843d 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6323,6 +6323,10 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) + &cpu->mwait.ecx, &cpu->mwait.edx); + env->features[FEAT_1_ECX] |= CPUID_EXT_MONITOR; + } ++ if (kvm_enabled() && cpu->ucode_rev == 0) { ++ cpu->ucode_rev = kvm_arch_get_supported_msr_feature(kvm_state, ++ MSR_IA32_UCODE_REV); ++ } + } + + if (cpu->ucode_rev == 0) { +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index f41605b..6c61aef 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -2696,6 +2696,11 @@ static void kvm_init_msrs(X86CPU *cpu) + env->features[FEAT_CORE_CAPABILITY]); + } + ++ if (kvm_arch_get_supported_msr_feature(kvm_state, ++ MSR_IA32_UCODE_REV)) { ++ kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev); ++ } ++ + /* + * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but + * all kernels with MSR features should have them. 
+-- +1.8.3.1 + diff --git a/kvm-target-i386-set-the-CPUID-level-to-0x14-on-old-machi.patch b/kvm-target-i386-set-the-CPUID-level-to-0x14-on-old-machi.patch new file mode 100755 index 0000000..49e54ba --- /dev/null +++ b/kvm-target-i386-set-the-CPUID-level-to-0x14-on-old-machi.patch @@ -0,0 +1,69 @@ +From 72a1827006be22791017ff2b671eac1c96be5d12 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 7 May 2020 22:09:23 +0100 +Subject: [PATCH 01/26] target/i386: set the CPUID level to 0x14 on old + machine-type + +RH-Author: plai@redhat.com +Message-id: <20200507220923.13723-1-plai@redhat.com> +Patchwork-id: 96347 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH RESEND] target/i386: set the CPUID level to 0x14 on old machine-type +Bugzilla: 1513681 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Igor Mammedov +RH-Acked-by: Danilo de Paula + +From: Luwei Kang + +BZ https://bugzilla.redhat.com/show_bug.cgi?id=1513681 +Brew: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=28146304 +Branch: rhel-av-8.2.1 + +Tested on intel-icelake-y-01.ml3.eng.bos.redhat.com. + +The CPUID level needs to be set to 0x14 manually on old +machine-type if Intel PT is enabled in the guest. E.g. +CPUID[0].EAX(level)=7 and CPUID[7].EBX[25](intel-pt)=1 when +QEMU is run with the "-machine pc-i440fx-3.1 -cpu qemu64,+intel-pt" parameters. + +Some Intel PT capabilities are exposed by leaf 0x14 and the +missing capabilities will cause some MSR accesses to fail. +This patch adds a warning message to inform the user to extend +the CPUID level. + +Suggested-by: Eduardo Habkost +Signed-off-by: Luwei Kang +Message-Id: <1584031686-16444-1-git-send-email-luwei.kang@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit ddc2fc9e4e42ebce48b088963dc7fbd1c08d5f33) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/cpu.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 1685a8c..0f0a2db 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6206,9 +6206,14 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error **errp) + x86_cpu_adjust_feat_level(cpu, FEAT_XSAVE); + + /* Intel Processor Trace requires CPUID[0x14] */ +- if ((env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) && +- kvm_enabled() && cpu->intel_pt_auto_level) { +- x86_cpu_adjust_level(cpu, &cpu->env.cpuid_min_level, 0x14); ++ if ((env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT)) { ++ if (cpu->intel_pt_auto_level) { ++ x86_cpu_adjust_level(cpu, &cpu->env.cpuid_min_level, 0x14); ++ } else if (cpu->env.cpuid_min_level < 0x14) { ++ mark_unavailable_features(cpu, FEAT_7_0_EBX, ++ CPUID_7_0_EBX_INTEL_PT, ++ "Intel PT need CPUID leaf 0x14, please set by \"-cpu ...,+intel-pt,level=0x14\""); ++ } + } + + /* CPU topology with multi-dies support requires CPUID[0x1F] */ +-- +1.8.3.1 + diff --git a/kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch b/kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch new file mode 100755 index 0000000..60abc1b --- /dev/null +++ b/kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch @@ -0,0 +1,56 @@ +From 9adf5e57df32df464e7465b1df72c993d0ed4ed4 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 31 Jul 2020 18:08:35 -0400 +Subject: [PATCH 3/4] target/i386: sev: fail query-sev-capabilities if QEMU + cannot use SEV +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200731180835.86786-3-pbonzini@redhat.com> +Patchwork-id: 98124 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/2] target/i386: sev: fail query-sev-capabilities if QEMU cannot use SEV +Bugzilla: 1689341 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Dr. 
David Alan Gilbert + +In some cases, such as if the kvm-amd "sev" module parameter is set +to 0, SEV will be unavailable but query-sev-capabilities will still +return all the information. This tricks libvirt into erroneously +reporting that SEV is available. Check the actual usability of the +feature and return the appropriate error if QEMU cannot use KVM +or KVM cannot use SEV. + +Reviewed-by: Eric Blake +Signed-off-by: Paolo Bonzini +cherry picked from commit 1b38750c40281dd0d068f8536b2ea95d7b9bd585 +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/sev.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/target/i386/sev.c b/target/i386/sev.c +index 054f2d846a..a47f0d3880 100644 +--- a/target/i386/sev.c ++++ b/target/i386/sev.c +@@ -504,6 +504,15 @@ sev_get_capabilities(Error **errp) + uint32_t ebx; + int fd; + ++ if (!kvm_enabled()) { ++ error_setg(errp, "KVM not enabled"); ++ return NULL; ++ } ++ if (kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, NULL) < 0) { ++ error_setg(errp, "SEV is not enabled in KVM"); ++ return NULL; ++ } ++ + fd = open(DEFAULT_SEV_DEVICE, O_RDWR); + if (fd < 0) { + error_setg_errno(errp, errno, "Failed to open %s", +-- +2.27.0 + diff --git a/kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch b/kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch new file mode 100755 index 0000000..e5f3459 --- /dev/null +++ b/kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch @@ -0,0 +1,142 @@ +From 8789f2662c6ddacc5472a803d253b94d93c6e9f0 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 31 Jul 2020 18:08:34 -0400 +Subject: [PATCH 2/4] target/i386: sev: provide proper error reporting for + query-sev-capabilities +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200731180835.86786-2-pbonzini@redhat.com> +Patchwork-id: 98123 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/2] target/i386: sev: provide proper error 
reporting for query-sev-capabilities +Bugzilla: 1689341 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Dr. David Alan Gilbert + +The query-sev-capabilities was reporting errors through error_report; +change it to use Error** so that the cause of the failure is clearer. + +Reviewed-by: Eric Blake +Signed-off-by: Paolo Bonzini +Cherry picked from commit e4f6278557148151e77260b872b41bcd7ceb4737 +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/monitor.c | 10 +--------- + target/i386/sev-stub.c | 3 ++- + target/i386/sev.c | 18 +++++++++--------- + target/i386/sev_i386.h | 2 +- + 4 files changed, 13 insertions(+), 20 deletions(-) + +diff --git a/target/i386/monitor.c b/target/i386/monitor.c +index 9fb4d641d5..cfd8075e4f 100644 +--- a/target/i386/monitor.c ++++ b/target/i386/monitor.c +@@ -727,13 +727,5 @@ SevLaunchMeasureInfo *qmp_query_sev_launch_measure(Error **errp) + + SevCapability *qmp_query_sev_capabilities(Error **errp) + { +- SevCapability *data; +- +- data = sev_get_capabilities(); +- if (!data) { +- error_setg(errp, "SEV feature is not available"); +- return NULL; +- } +- +- return data; ++ return sev_get_capabilities(errp); + } +diff --git a/target/i386/sev-stub.c b/target/i386/sev-stub.c +index e5ee13309c..88e3f39a1e 100644 +--- a/target/i386/sev-stub.c ++++ b/target/i386/sev-stub.c +@@ -44,7 +44,8 @@ char *sev_get_launch_measurement(void) + return NULL; + } + +-SevCapability *sev_get_capabilities(void) ++SevCapability *sev_get_capabilities(Error **errp) + { ++ error_setg(errp, "SEV is not available in this QEMU"); + return NULL; + } +diff --git a/target/i386/sev.c b/target/i386/sev.c +index 024bb24e51..054f2d846a 100644 +--- a/target/i386/sev.c ++++ b/target/i386/sev.c +@@ -453,7 +453,7 @@ sev_get_info(void) + + static int + sev_get_pdh_info(int fd, guchar **pdh, size_t *pdh_len, guchar **cert_chain, +- size_t *cert_chain_len) ++ size_t *cert_chain_len, Error **errp) + { + guchar *pdh_data = NULL; + guchar 
*cert_chain_data = NULL; +@@ -464,8 +464,8 @@ sev_get_pdh_info(int fd, guchar **pdh, size_t *pdh_len, guchar **cert_chain, + r = sev_platform_ioctl(fd, SEV_PDH_CERT_EXPORT, &export, &err); + if (r < 0) { + if (err != SEV_RET_INVALID_LEN) { +- error_report("failed to export PDH cert ret=%d fw_err=%d (%s)", +- r, err, fw_error_to_str(err)); ++ error_setg(errp, "failed to export PDH cert ret=%d fw_err=%d (%s)", ++ r, err, fw_error_to_str(err)); + return 1; + } + } +@@ -477,8 +477,8 @@ sev_get_pdh_info(int fd, guchar **pdh, size_t *pdh_len, guchar **cert_chain, + + r = sev_platform_ioctl(fd, SEV_PDH_CERT_EXPORT, &export, &err); + if (r < 0) { +- error_report("failed to export PDH cert ret=%d fw_err=%d (%s)", +- r, err, fw_error_to_str(err)); ++ error_setg(errp, "failed to export PDH cert ret=%d fw_err=%d (%s)", ++ r, err, fw_error_to_str(err)); + goto e_free; + } + +@@ -495,7 +495,7 @@ e_free: + } + + SevCapability * +-sev_get_capabilities(void) ++sev_get_capabilities(Error **errp) + { + SevCapability *cap = NULL; + guchar *pdh_data = NULL; +@@ -506,13 +506,13 @@ sev_get_capabilities(void) + + fd = open(DEFAULT_SEV_DEVICE, O_RDWR); + if (fd < 0) { +- error_report("%s: Failed to open %s '%s'", __func__, +- DEFAULT_SEV_DEVICE, strerror(errno)); ++ error_setg_errno(errp, errno, "Failed to open %s", ++ DEFAULT_SEV_DEVICE); + return NULL; + } + + if (sev_get_pdh_info(fd, &pdh_data, &pdh_len, +- &cert_chain_data, &cert_chain_len)) { ++ &cert_chain_data, &cert_chain_len, errp)) { + goto out; + } + +diff --git a/target/i386/sev_i386.h b/target/i386/sev_i386.h +index 8ada9d385d..1e073342ba 100644 +--- a/target/i386/sev_i386.h ++++ b/target/i386/sev_i386.h +@@ -38,7 +38,7 @@ extern SevInfo *sev_get_info(void); + extern uint32_t sev_get_cbit_position(void); + extern uint32_t sev_get_reduced_phys_bits(void); + extern char *sev_get_launch_measurement(void); +-extern SevCapability *sev_get_capabilities(void); ++extern SevCapability *sev_get_capabilities(Error **errp); + + typedef 
struct QSevGuestInfo QSevGuestInfo; + typedef struct QSevGuestInfoClass QSevGuestInfoClass; +-- +2.27.0 + diff --git a/kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch b/kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch new file mode 100755 index 0000000..38e5637 --- /dev/null +++ b/kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch @@ -0,0 +1,60 @@ +From c4fe37ae6d75ed72e6a3bde01fea053eb508274c Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 5 Jun 2020 07:41:11 -0400 +Subject: [PATCH 41/42] target/s390x/kvm: Enable adapter interruption + suppression again +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200605074111.2185-4-thuth@redhat.com> +Patchwork-id: 97370 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 3/3] target/s390x/kvm: Enable adapter interruption suppression again +Bugzilla: 1756946 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +The AIS feature has been disabled late in the v2.10 development cycle since +there were some issues with migration (see commit 3f2d07b3b01ea61126b - +"s390x/ais: for 2.10 stable: disable ais facility"). We originally wanted +to enable it again for newer machine types, but apparently we forgot to do +this so far. Let's do it now for the machines that support proper CPU models. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1756946 +Signed-off-by: Thomas Huth +Message-Id: <20200122101437.5069-1-thuth@redhat.com> +Reviewed-by: David Hildenbrand +Tested-by: Matthew Rosato +Signed-off-by: Cornelia Huck +(cherry picked from commit a5c8617af6919515b84256978452edf07401c45e) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/s390x/kvm.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index c589ef9034..0bbf8f81b0 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -377,10 +377,13 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + /* + * The migration interface for ais was introduced with kernel 4.13 + * but the capability itself had been active since 4.12. As migration +- * support is considered necessary let's disable ais in the 2.10 +- * machine. ++ * support is considered necessary, we only try to enable this for ++ * newer machine types if KVM_CAP_S390_AIS_MIGRATION is available. + */ +- /* kvm_vm_enable_cap(s, KVM_CAP_S390_AIS, 0); */ ++ if (cpu_model_allowed() && kvm_kernel_irqchip_allowed() && ++ kvm_check_extension(s, KVM_CAP_S390_AIS_MIGRATION)) { ++ kvm_vm_enable_cap(s, KVM_CAP_S390_AIS, 0); ++ } + + kvm_set_max_memslot_size(KVM_SLOT_MAX_BYTES); + return 0; +-- +2.27.0 + diff --git a/kvm-tcp_emu-Fix-oob-access.patch b/kvm-tcp_emu-Fix-oob-access.patch new file mode 100755 index 0000000..e532877 --- /dev/null +++ b/kvm-tcp_emu-Fix-oob-access.patch @@ -0,0 +1,59 @@ +From 5c2c5496083fa549e1dff903413bb6136fc19d8d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Fri, 17 Jan 2020 12:07:56 +0100 +Subject: [PATCH 1/4] tcp_emu: Fix oob access +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200117120758.1076549-2-marcandre.lureau@redhat.com> +Patchwork-id: 93399 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm + RHEL-AV-8.2.0 qemu-kvm PATCH 1/3] tcp_emu: Fix oob access +Bugzilla: 1791568 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi + +From: Samuel Thibault + +The main loop only checks for one available byte, while we sometimes +need two bytes. 
+ +[ MA - minor conflict, CHANGELOG.md absent ] +(cherry picked from libslirp commit 2655fffed7a9e765bcb4701dd876e9dab975f289) +Signed-off-by: Marc-André Lureau + +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tcp_subr.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index d6dd133..cbecd64 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -886,6 +886,9 @@ int tcp_emu(struct socket *so, struct mbuf *m) + break; + + case 5: ++ if (bptr == m->m_data + m->m_len - 1) ++ return 1; /* We need two bytes */ ++ + /* + * The difference between versions 1.0 and + * 2.0 is here. For future versions of +@@ -901,6 +904,10 @@ int tcp_emu(struct socket *so, struct mbuf *m) + /* This is the field containing the port + * number that RA-player is listening to. + */ ++ ++ if (bptr == m->m_data + m->m_len - 1) ++ return 1; /* We need two bytes */ ++ + lport = (((uint8_t *)bptr)[0] << 8) + ((uint8_t *)bptr)[1]; + if (lport < 6970) + lport += 256; /* don't know why */ +-- +1.8.3.1 + diff --git a/kvm-tcp_emu-fix-unsafe-snprintf-usages.patch b/kvm-tcp_emu-fix-unsafe-snprintf-usages.patch new file mode 100755 index 0000000..846da73 --- /dev/null +++ b/kvm-tcp_emu-fix-unsafe-snprintf-usages.patch @@ -0,0 +1,149 @@ +From 9a7810c257711ce02627916d886fc1029f7a8190 Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Thu, 13 Feb 2020 15:50:49 +0000 +Subject: [PATCH 3/7] tcp_emu: fix unsafe snprintf() usages +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200213155049.3936-3-jmaloy@redhat.com> +Patchwork-id: 93826 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] tcp_emu: fix unsafe snprintf() usages +Bugzilla: 1798994 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Various calls to snprintf() assume that snprintf() returns "only" the +number of bytes written (excluding 
terminating NUL). + +https://pubs.opengroup.org/onlinepubs/9699919799/functions/snprintf.html#tag_16_159_04 + +"Upon successful completion, the snprintf() function shall return the +number of bytes that would be written to s had n been sufficiently +large excluding the terminating null byte." + +Before patch ce131029, if there isn't enough room in "m_data" for the +"DCC ..." message, we overflow "m_data". + +After the patch, if there isn't enough room for the same, we don't +overflow "m_data", but we set "m_len" out-of-bounds. The next time an +access is bounded by "m_len", we'll have a buffer overflow then. + +Use slirp_fmt*() to fix potential OOB memory access. + +Reported-by: Laszlo Ersek +Signed-off-by: Marc-André Lureau +Reviewed-by: Samuel Thibault +Message-Id: <20200127092414.169796-7-marcandre.lureau@redhat.com> +(cherry picked from libslirp commit 68ccb8021a838066f0951d4b2817eb6b6f10a843) +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/tcp_subr.c | 44 +++++++++++++++++++++----------------------- + 1 file changed, 21 insertions(+), 23 deletions(-) + +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index 954d1a6..26d4ead 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -655,8 +655,7 @@ int tcp_emu(struct socket *so, struct mbuf *m) + NTOHS(n1); + NTOHS(n2); + m_inc(m, snprintf(NULL, 0, "%d,%d\r\n", n1, n2) + 1); +- m->m_len = snprintf(m->m_data, M_ROOM(m), "%d,%d\r\n", n1, n2); +- assert(m->m_len < M_ROOM(m)); ++ m->m_len = slirp_fmt(m->m_data, M_ROOM(m), "%d,%d\r\n", n1, n2); + } else { + *eol = '\r'; + } +@@ -696,9 +695,9 @@ int tcp_emu(struct socket *so, struct mbuf *m) + n4 = (laddr & 0xff); + + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "ORT %d,%d,%d,%d,%d,%d\r\n%s", n1, n2, n3, n4, +- n5, n6, x == 7 ? buff : ""); ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "ORT %d,%d,%d,%d,%d,%d\r\n%s", ++ n1, n2, n3, n4, n5, n6, x == 7 ? 
buff : ""); + return 1; + } else if ((bptr = (char *)strstr(m->m_data, "27 Entering")) != NULL) { + /* +@@ -731,10 +730,9 @@ int tcp_emu(struct socket *so, struct mbuf *m) + n4 = (laddr & 0xff); + + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "27 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n%s", +- n1, n2, n3, n4, n5, n6, x == 7 ? buff : ""); +- ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "27 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n%s", ++ n1, n2, n3, n4, n5, n6, x == 7 ? buff : ""); + return 1; + } + +@@ -757,8 +755,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + if (m->m_data[m->m_len - 1] == '\0' && lport != 0 && + (so = tcp_listen(slirp, INADDR_ANY, 0, so->so_laddr.s_addr, + htons(lport), SS_FACCEPTONCE)) != NULL) +- m->m_len = snprintf(m->m_data, M_ROOM(m), +- "%d", ntohs(so->so_fport)) + 1; ++ m->m_len = slirp_fmt0(m->m_data, M_ROOM(m), ++ "%d", ntohs(so->so_fport)); + return 1; + + case EMU_IRC: +@@ -777,10 +775,10 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "DCC CHAT chat %lu %u%c\n", +- (unsigned long)ntohl(so->so_faddr.s_addr), +- ntohs(so->so_fport), 1); ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "DCC CHAT chat %lu %u%c\n", ++ (unsigned long)ntohl(so->so_faddr.s_addr), ++ ntohs(so->so_fport), 1); + } else if (sscanf(bptr, "DCC SEND %256s %u %u %u", buff, &laddr, &lport, + &n1) == 4) { + if ((so = tcp_listen(slirp, INADDR_ANY, 0, htonl(laddr), +@@ -788,10 +786,10 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "DCC SEND %s %lu %u %u%c\n", buff, +- (unsigned long)ntohl(so->so_faddr.s_addr), +- ntohs(so->so_fport), n1, 1); ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "DCC SEND %s %lu %u %u%c\n", buff, ++ (unsigned long)ntohl(so->so_faddr.s_addr), 
++ ntohs(so->so_fport), n1, 1); + } else if (sscanf(bptr, "DCC MOVE %256s %u %u %u", buff, &laddr, &lport, + &n1) == 4) { + if ((so = tcp_listen(slirp, INADDR_ANY, 0, htonl(laddr), +@@ -799,10 +797,10 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "DCC MOVE %s %lu %u %u%c\n", buff, +- (unsigned long)ntohl(so->so_faddr.s_addr), +- ntohs(so->so_fport), n1, 1); ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "DCC MOVE %s %lu %u %u%c\n", buff, ++ (unsigned long)ntohl(so->so_faddr.s_addr), ++ ntohs(so->so_fport), n1, 1); + } + return 1; + +-- +1.8.3.1 + diff --git a/kvm-tests-arm-cpu-features-Check-feature-default-values.patch b/kvm-tests-arm-cpu-features-Check-feature-default-values.patch new file mode 100755 index 0000000..e8a48bf --- /dev/null +++ b/kvm-tests-arm-cpu-features-Check-feature-default-values.patch @@ -0,0 +1,106 @@ +From 323889aa2182bf39df10f1caf43f22daea2d7d37 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:12 +0000 +Subject: [PATCH 10/15] tests/arm-cpu-features: Check feature default values +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-4-drjones@redhat.com> +Patchwork-id: 93626 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/5] tests/arm-cpu-features: Check feature default values +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:06 +0000 + + tests/arm-cpu-features: Check feature default values + + If we know what the default value should be then we can test for + that as well as the feature existence. 
+ + Signed-off-by: Andrew Jones + Reviewed-by: Richard Henderson + Message-id: 20200120101023.16030-5-drjones@redhat.com + Signed-off-by: Peter Maydell + +(cherry picked from commit 789a35efb583464f9fcd5d871a7fd6164318bb91) +Signed-off-by: Danilo C. L. de Paula +--- + tests/arm-cpu-features.c | 37 ++++++++++++++++++++++++++++--------- + 1 file changed, 28 insertions(+), 9 deletions(-) + +diff --git a/tests/arm-cpu-features.c b/tests/arm-cpu-features.c +index 6e99aa9..89285ca 100644 +--- a/tests/arm-cpu-features.c ++++ b/tests/arm-cpu-features.c +@@ -159,6 +159,25 @@ static bool resp_get_feature(QDict *resp, const char *feature) + qobject_unref(_resp); \ + }) + ++#define assert_feature(qts, cpu_type, feature, expected_value) \ ++({ \ ++ QDict *_resp, *_props; \ ++ \ ++ _resp = do_query_no_props(qts, cpu_type); \ ++ g_assert(_resp); \ ++ g_assert(resp_has_props(_resp)); \ ++ _props = resp_get_props(_resp); \ ++ g_assert(qdict_get(_props, feature)); \ ++ g_assert(qdict_get_bool(_props, feature) == (expected_value)); \ ++ qobject_unref(_resp); \ ++}) ++ ++#define assert_has_feature_enabled(qts, cpu_type, feature) \ ++ assert_feature(qts, cpu_type, feature, true) ++ ++#define assert_has_feature_disabled(qts, cpu_type, feature) \ ++ assert_feature(qts, cpu_type, feature, false) ++ + static void assert_type_full(QTestState *qts) + { + const char *error; +@@ -405,16 +424,16 @@ static void test_query_cpu_model_expansion(const void *data) + assert_error(qts, "host", "The CPU type 'host' requires KVM", NULL); + + /* Test expected feature presence/absence for some cpu types */ +- assert_has_feature(qts, "max", "pmu"); +- assert_has_feature(qts, "cortex-a15", "pmu"); ++ assert_has_feature_enabled(qts, "max", "pmu"); ++ assert_has_feature_enabled(qts, "cortex-a15", "pmu"); + assert_has_not_feature(qts, "cortex-a15", "aarch64"); + + if (g_str_equal(qtest_get_arch(), "aarch64")) { +- assert_has_feature(qts, "max", "aarch64"); +- assert_has_feature(qts, "max", "sve"); +- 
assert_has_feature(qts, "max", "sve128"); +- assert_has_feature(qts, "cortex-a57", "pmu"); +- assert_has_feature(qts, "cortex-a57", "aarch64"); ++ assert_has_feature_enabled(qts, "max", "aarch64"); ++ assert_has_feature_enabled(qts, "max", "sve"); ++ assert_has_feature_enabled(qts, "max", "sve128"); ++ assert_has_feature_enabled(qts, "cortex-a57", "pmu"); ++ assert_has_feature_enabled(qts, "cortex-a57", "aarch64"); + + sve_tests_default(qts, "max"); + +@@ -451,8 +470,8 @@ static void test_query_cpu_model_expansion_kvm(const void *data) + QDict *resp; + char *error; + +- assert_has_feature(qts, "host", "aarch64"); +- assert_has_feature(qts, "host", "pmu"); ++ assert_has_feature_enabled(qts, "host", "aarch64"); ++ assert_has_feature_enabled(qts, "host", "pmu"); + + assert_error(qts, "cortex-a15", + "We cannot guarantee the CPU type 'cortex-a15' works " +-- +1.8.3.1 + diff --git a/kvm-tests-bios-tables-test-add-test-cases-for-ACPI-HMAT.patch b/kvm-tests-bios-tables-test-add-test-cases-for-ACPI-HMAT.patch new file mode 100755 index 0000000..12df637 --- /dev/null +++ b/kvm-tests-bios-tables-test-add-test-cases-for-ACPI-HMAT.patch @@ -0,0 +1,127 @@ +From 6d549629becb69f315dd4213f730122d19c9c566 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:54 +0100 +Subject: [PATCH 11/12] tests/bios-tables-test: add test cases for ACPI HMAT + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-11-plai@redhat.com> +Patchwork-id: 96739 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 10/11] tests/bios-tables-test: add test cases for ACPI HMAT +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Tao Xu + +ACPI table HMAT has been introduced, QEMU now builds HMAT tables for +Heterogeneous Memory with boot option '-numa node'. + +Add test cases on PC and Q35 machines with 2 numa nodes. 
+Because HMAT is generated when system enable numa, the +following tables need to be added for this test: + tests/data/acpi/pc/APIC.acpihmat + tests/data/acpi/pc/SRAT.acpihmat + tests/data/acpi/pc/HMAT.acpihmat + tests/data/acpi/pc/DSDT.acpihmat + tests/data/acpi/q35/APIC.acpihmat + tests/data/acpi/q35/SRAT.acpihmat + tests/data/acpi/q35/HMAT.acpihmat + tests/data/acpi/q35/DSDT.acpihmat + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Reviewed-by: Daniel Black +Reviewed-by: Jingqi Liu +Suggested-by: Igor Mammedov +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-9-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 1c8f85d93d261dc555a0aad6f54f2b5e8009d859) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + tests/bios-tables-test-allowed-diff.h | 8 +++++++ + tests/bios-tables-test.c | 44 +++++++++++++++++++++++++++++++++++ + 2 files changed, 52 insertions(+) + +diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h +index dfb8523..3c9e0c9 100644 +--- a/tests/bios-tables-test-allowed-diff.h ++++ b/tests/bios-tables-test-allowed-diff.h +@@ -1 +1,9 @@ + /* List of comma-separated changed AML files to ignore */ ++"tests/data/acpi/pc/APIC.acpihmat", ++"tests/data/acpi/pc/SRAT.acpihmat", ++"tests/data/acpi/pc/HMAT.acpihmat", ++"tests/data/acpi/pc/DSDT.acpihmat", ++"tests/data/acpi/q35/APIC.acpihmat", ++"tests/data/acpi/q35/SRAT.acpihmat", ++"tests/data/acpi/q35/HMAT.acpihmat", ++"tests/data/acpi/q35/DSDT.acpihmat", +diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c +index 79f5da0..9823820 100644 +--- a/tests/bios-tables-test.c ++++ b/tests/bios-tables-test.c +@@ -947,6 +947,48 @@ static void test_acpi_virt_tcg_numamem(void) + + } + ++static void test_acpi_tcg_acpi_hmat(const char *machine) ++{ ++ test_data data; ++ ++ memset(&data, 0, sizeof(data)); ++ data.machine = machine; ++ data.variant = ".acpihmat"; ++ 
test_acpi_one(" -machine hmat=on" ++ " -smp 2,sockets=2" ++ " -m 128M,slots=2,maxmem=1G" ++ " -object memory-backend-ram,size=64M,id=m0" ++ " -object memory-backend-ram,size=64M,id=m1" ++ " -numa node,nodeid=0,memdev=m0" ++ " -numa node,nodeid=1,memdev=m1,initiator=0" ++ " -numa cpu,node-id=0,socket-id=0" ++ " -numa cpu,node-id=0,socket-id=1" ++ " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," ++ "data-type=access-latency,latency=1" ++ " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," ++ "data-type=access-bandwidth,bandwidth=65534M" ++ " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," ++ "data-type=access-latency,latency=65534" ++ " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," ++ "data-type=access-bandwidth,bandwidth=32767M" ++ " -numa hmat-cache,node-id=0,size=10K,level=1," ++ "associativity=direct,policy=write-back,line=8" ++ " -numa hmat-cache,node-id=1,size=10K,level=1," ++ "associativity=direct,policy=write-back,line=8", ++ &data); ++ free_test_data(&data); ++} ++ ++static void test_acpi_q35_tcg_acpi_hmat(void) ++{ ++ test_acpi_tcg_acpi_hmat(MACHINE_Q35); ++} ++ ++static void test_acpi_piix4_tcg_acpi_hmat(void) ++{ ++ test_acpi_tcg_acpi_hmat(MACHINE_PC); ++} ++ + static void test_acpi_virt_tcg(void) + { + test_data data = { +@@ -991,6 +1033,8 @@ int main(int argc, char *argv[]) + qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg_numamem); + qtest_add_func("acpi/piix4/dimmpxm", test_acpi_piix4_tcg_dimm_pxm); + qtest_add_func("acpi/q35/dimmpxm", test_acpi_q35_tcg_dimm_pxm); ++ qtest_add_func("acpi/piix4/acpihmat", test_acpi_piix4_tcg_acpi_hmat); ++ qtest_add_func("acpi/q35/acpihmat", test_acpi_q35_tcg_acpi_hmat); + } else if (strcmp(arch, "aarch64") == 0) { + qtest_add_func("acpi/virt", test_acpi_virt_tcg); + qtest_add_func("acpi/virt/numamem", test_acpi_virt_tcg_numamem); +-- +1.8.3.1 + diff --git a/kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch b/kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch new file 
mode 100755 index 0000000..240c408 --- /dev/null +++ b/kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch @@ -0,0 +1,60 @@ +From f73b18e03c6758500bf367b1575205772d1f878f Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:52 -0400 +Subject: [PATCH 10/42] tests/boot-sector: Fix the bad s390x assembler code + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-11-thuth@redhat.com> +Patchwork-id: 97031 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 10/38] tests/boot-sector: Fix the bad s390x assembler code +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +There are currently two bugs in s390x_code[]: First, the initial jump +uses the wrong offset, so it was jumping to 0x10014 instead of 0x10010. +Second, LHI only loads the lower 32 bits of the register. + +Everything worked fine as long as the s390-ccw bios code was jumping +here with r3 containing zeroes in the uppermost 48 bits - which just +happened to be the case so far by accident. But we cannot rely on this +fact, and indeed one of the recently suggested patches to jump2ipl.c causes +the newer GCCs to put different values into r3. In that case the code +from s390x_code[] crashes very ungracefully. + +Thus let's make sure to jump to the right instruction, and use LGHI +instead of LHI to make sure that we always zero out the upper bits +of the register. + +Signed-off-by: Thomas Huth +Message-Id: <20191217150642.27946-1-thuth@redhat.com> +Reviewed-by: Christian Borntraeger +Signed-off-by: Cornelia Huck +(cherry picked from commit 5afec76fbe2c07d03fd8c9ac525140059499637a) +Signed-off-by: Danilo C. L.
de Paula +--- + tests/boot-sector.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tests/boot-sector.c b/tests/boot-sector.c +index 7824286b9a..9e66c6d013 100644 +--- a/tests/boot-sector.c ++++ b/tests/boot-sector.c +@@ -75,11 +75,11 @@ static const uint8_t s390x_psw_and_magic[] = { + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* in the s390-ccw bios */ + }; + static const uint8_t s390x_code[] = { +- 0xa7, 0xf4, 0x00, 0x0a, /* j 0x10010 */ ++ 0xa7, 0xf4, 0x00, 0x08, /* j 0x10010 */ + 0x00, 0x00, 0x00, 0x00, + 'S', '3', '9', '0', + 'E', 'P', 0x00, 0x01, +- 0xa7, 0x38, HIGH(SIGNATURE_ADDR), LOW(SIGNATURE_ADDR), /* lhi r3,0x7c10 */ ++ 0xa7, 0x39, HIGH(SIGNATURE_ADDR), LOW(SIGNATURE_ADDR), /* lghi r3,0x7c10 */ + 0xa7, 0x48, LOW(SIGNATURE), HIGH(SIGNATURE), /* lhi r4,0xadde */ + 0x40, 0x40, 0x30, 0x00, /* sth r4,0(r3) */ + 0xa7, 0xf4, 0xff, 0xfa /* j 0x10010 */ +-- +2.27.0 + diff --git a/kvm-tests-numa-Add-case-for-QMP-build-HMAT.patch b/kvm-tests-numa-Add-case-for-QMP-build-HMAT.patch new file mode 100755 index 0000000..41ee71c --- /dev/null +++ b/kvm-tests-numa-Add-case-for-QMP-build-HMAT.patch @@ -0,0 +1,266 @@ +From 0f11aae02dcabd3a5ee0b5946aec39da6dddea52 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:53 +0100 +Subject: [PATCH 10/12] tests/numa: Add case for QMP build HMAT + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-10-plai@redhat.com> +Patchwork-id: 96735 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 09/11] tests/numa: Add case for QMP build HMAT +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Tao Xu + +Check configuring HMAT usecase + +Acked-by: Markus Armbruster +Suggested-by: Igor Mammedov +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-8-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +Reviewed-by: Igor Mammedov +(cherry picked from commit d00817c944ed15fbe4a61d44fe7f9fe166c7df88) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + tests/numa-test.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 213 insertions(+) + +diff --git a/tests/numa-test.c b/tests/numa-test.c +index 8de8581..17dd807 100644 +--- a/tests/numa-test.c ++++ b/tests/numa-test.c +@@ -327,6 +327,216 @@ static void pc_dynamic_cpu_cfg(const void *data) + qtest_quit(qs); + } + ++static void pc_hmat_build_cfg(const void *data) ++{ ++ QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " ++ "-smp 2,sockets=2 " ++ "-m 128M,slots=2,maxmem=1G " ++ "-object memory-backend-ram,size=64M,id=m0 " ++ "-object memory-backend-ram,size=64M,id=m1 " ++ "-numa node,nodeid=0,memdev=m0 " ++ "-numa node,nodeid=1,memdev=m1,initiator=0 " ++ "-numa cpu,node-id=0,socket-id=0 " ++ "-numa cpu,node-id=0,socket-id=1", ++ data ? (char *)data : ""); ++ ++ /* Fail: Initiator should be less than the number of nodes */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 2, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); ++ ++ /* Fail: Target should be less than the number of nodes */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 2," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); ++ ++ /* Fail: Initiator should contain cpu */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 1, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); ++ ++ /* Fail: Data-type mismatch */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 
'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"write-latency\"," ++ " 'bandwidth': 524288000 } }"))); ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"read-bandwidth\"," ++ " 'latency': 5 } }"))); ++ ++ /* Fail: Bandwidth should be 1MB (1048576) aligned */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 'bandwidth': 1048575 } }"))); ++ ++ /* Configuring HMAT bandwidth and latency details */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 1 } }"))); /* 1 ns */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 5 } }"))); /* Fail: Duplicate configuration */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 'bandwidth': 68717379584 } }"))); /* 65534 MB/s */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 65534 } }"))); /* 65534 ns */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 
'bandwidth': 34358689792 } }"))); /* 32767 MB/s */ ++ ++ /* Fail: node_id should be less than the number of nodes */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 2, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* Fail: level should be less than HMAT_LB_LEVELS (4) */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 4, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* Fail: associativity option should be 'none', if level is 0 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 0, 'associativity': \"direct\", 'policy': \"none\"," ++ " 'line': 0 } }"))); ++ /* Fail: policy option should be 'none', if level is 0 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 0, 'associativity': \"none\", 'policy': \"write-back\"," ++ " 'line': 0 } }"))); ++ /* Fail: line option should be 0, if level is 0 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 0, 'associativity': \"none\", 'policy': \"none\"," ++ " 'line': 8 } }"))); ++ ++ /* Configuring HMAT memory side cache attributes */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 
'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); /* Fail: Duplicate configuration */ ++ /* Fail: The size of level 2 size should be small than level 1 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 2, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ /* Fail: The size of level 0 size should be larger than level 1 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 0, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 1, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* let machine initialization to complete and run */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, ++ "{ 'execute': 'x-exit-preconfig' }"))); ++ qtest_qmp_eventwait(qs, "RESUME"); ++ ++ qtest_quit(qs); ++} ++ ++static void pc_hmat_off_cfg(const void *data) ++{ ++ QTestState *qs = qtest_initf("%s -nodefaults --preconfig " ++ "-smp 2,sockets=2 " ++ "-m 128M,slots=2,maxmem=1G " ++ "-object memory-backend-ram,size=64M,id=m0 " ++ "-object memory-backend-ram,size=64M,id=m1 " ++ "-numa node,nodeid=0,memdev=m0", ++ data ? 
(char *)data : ""); ++ ++ /* ++ * Fail: Enable HMAT with -machine hmat=on ++ * before using any of hmat specific options ++ */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\"," ++ " 'initiator': 0 } }"))); ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\" } }"))); ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 1 } }"))); ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* let machine initialization to complete and run */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, ++ "{ 'execute': 'x-exit-preconfig' }"))); ++ qtest_qmp_eventwait(qs, "RESUME"); ++ ++ qtest_quit(qs); ++} ++ ++static void pc_hmat_erange_cfg(const void *data) ++{ ++ QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " ++ "-smp 2,sockets=2 " ++ "-m 128M,slots=2,maxmem=1G " ++ "-object memory-backend-ram,size=64M,id=m0 " ++ "-object memory-backend-ram,size=64M,id=m1 " ++ "-numa node,nodeid=0,memdev=m0 " ++ "-numa node,nodeid=1,memdev=m1,initiator=0 " ++ "-numa cpu,node-id=0,socket-id=0 " ++ "-numa cpu,node-id=0,socket-id=1", ++ data ? 
(char *)data : ""); ++ ++ /* Can't store the compressed latency */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 1 } }"))); /* 1 ns */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 65535 } }"))); /* 65535 ns */ ++ ++ /* Test the 0 input (bandwidth not provided) */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 'bandwidth': 0 } }"))); /* 0 MB/s */ ++ /* Fail: bandwidth should be provided before memory side cache attributes */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* Can't store the compressed bandwidth */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 'bandwidth': 68718428160 } }"))); /* 65535 MB/s */ ++ ++ /* let machine initialization to complete and run */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, ++ "{ 'execute': 'x-exit-preconfig' }"))); ++ qtest_qmp_eventwait(qs, "RESUME"); ++ ++ qtest_quit(qs); ++} ++ + int main(int argc, char **argv) + { + const char *args = NULL; +@@ -346,6 +556,9 @@ int main(int argc, char **argv) + if (!strcmp(arch, "i386") || !strcmp(arch, "x86_64")) { + qtest_add_data_func("/numa/pc/cpu/explicit", args, pc_numa_cpu); + 
qtest_add_data_func("/numa/pc/dynamic/cpu", args, pc_dynamic_cpu_cfg); ++ qtest_add_data_func("/numa/pc/hmat/build", args, pc_hmat_build_cfg); ++ qtest_add_data_func("/numa/pc/hmat/off", args, pc_hmat_off_cfg); ++ qtest_add_data_func("/numa/pc/hmat/erange", args, pc_hmat_erange_cfg); + } + + if (!strcmp(arch, "ppc64")) { +-- +1.8.3.1 + diff --git a/kvm-tftp-check-tftp_input-buffer-size.patch b/kvm-tftp-check-tftp_input-buffer-size.patch new file mode 100755 index 0000000..85ed811 --- /dev/null +++ b/kvm-tftp-check-tftp_input-buffer-size.patch @@ -0,0 +1,53 @@ +From 6bd4d80f9274f76eb402ce85aa60729150b39980 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:34 -0400 +Subject: [PATCH 09/14] tftp: check tftp_input buffer size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-6-marcandre.lureau@redhat.com> +Patchwork-id: 101823 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 5/8] tftp: check tftp_input buffer size +Bugzilla: 1970843 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Fixes: CVE-2021-3595 +Fixes: https://gitlab.freedesktop.org/slirp/libslirp/-/issues/46 + +Signed-off-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1970843 + +(cherry picked from commit 3f17948137155f025f7809fdc38576d5d2451c3d) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tftp.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/slirp/src/tftp.c b/slirp/src/tftp.c +index 093c2e06a3..07e8f3cb2f 100644 +--- a/slirp/src/tftp.c ++++ b/slirp/src/tftp.c +@@ -444,7 +444,11 @@ static void tftp_handle_error(Slirp *slirp, struct sockaddr_storage *srcsas, + + void tftp_input(struct sockaddr_storage *srcsas, struct mbuf *m) + { +- struct tftp_t *tp = (struct tftp_t *)m->m_data; ++ struct tftp_t *tp = 
mtod_check(m, offsetof(struct tftp_t, x.tp_buf)); ++ ++ if (tp == NULL) { ++ return; ++ } + + switch (ntohs(tp->tp_op)) { + case TFTP_RRQ: +-- +2.27.0 + diff --git a/kvm-tftp-introduce-a-header-structure.patch b/kvm-tftp-introduce-a-header-structure.patch new file mode 100755 index 0000000..d8c8ddb --- /dev/null +++ b/kvm-tftp-introduce-a-header-structure.patch @@ -0,0 +1,263 @@ +From af72e344459614fcf2746739f05494ef7e691a78 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:36 -0400 +Subject: [PATCH 10/14] tftp: introduce a header structure +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-7-marcandre.lureau@redhat.com> +Patchwork-id: 101825 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 6/8] tftp: introduce a header structure +Bugzilla: 1970819 1970835 1970843 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Instead of using a composed structure and potentially reading past the +incoming buffer, use a different structure for the header. 
+ +Signed-off-by: Marc-André Lureau + +(cherry picked from commit 990163cf3ac86b7875559f49602c4d76f46f6f30) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tftp.c | 58 +++++++++++++++++++++++++----------------------- + slirp/src/tftp.h | 6 ++++- + 2 files changed, 35 insertions(+), 29 deletions(-) + +diff --git a/slirp/src/tftp.c b/slirp/src/tftp.c +index 07e8f3cb2f..53e04d0aeb 100644 +--- a/slirp/src/tftp.c ++++ b/slirp/src/tftp.c +@@ -50,7 +50,7 @@ static void tftp_session_terminate(struct tftp_session *spt) + } + + static int tftp_session_allocate(Slirp *slirp, struct sockaddr_storage *srcsas, +- struct tftp_t *tp) ++ struct tftphdr *hdr) + { + struct tftp_session *spt; + int k; +@@ -75,7 +75,7 @@ found: + memcpy(&spt->client_addr, srcsas, sockaddr_size(srcsas)); + spt->fd = -1; + spt->block_size = 512; +- spt->client_port = tp->udp.uh_sport; ++ spt->client_port = hdr->udp.uh_sport; + spt->slirp = slirp; + + tftp_session_update(spt); +@@ -84,7 +84,7 @@ found: + } + + static int tftp_session_find(Slirp *slirp, struct sockaddr_storage *srcsas, +- struct tftp_t *tp) ++ struct tftphdr *hdr) + { + struct tftp_session *spt; + int k; +@@ -94,7 +94,7 @@ static int tftp_session_find(Slirp *slirp, struct sockaddr_storage *srcsas, + + if (tftp_session_in_use(spt)) { + if (sockaddr_equal(&spt->client_addr, srcsas)) { +- if (spt->client_port == tp->udp.uh_sport) { ++ if (spt->client_port == hdr->udp.uh_sport) { + return k; + } + } +@@ -146,13 +146,13 @@ static struct tftp_t *tftp_prep_mbuf_data(struct tftp_session *spt, + } + + static void tftp_udp_output(struct tftp_session *spt, struct mbuf *m, +- struct tftp_t *recv_tp) ++ struct tftphdr *hdr) + { + if (spt->client_addr.ss_family == AF_INET6) { + struct sockaddr_in6 sa6, da6; + + sa6.sin6_addr = spt->slirp->vhost_addr6; +- sa6.sin6_port = recv_tp->udp.uh_dport; ++ sa6.sin6_port = hdr->udp.uh_dport; + da6.sin6_addr = ((struct sockaddr_in6 *)&spt->client_addr)->sin6_addr; + da6.sin6_port 
= spt->client_port; + +@@ -161,7 +161,7 @@ static void tftp_udp_output(struct tftp_session *spt, struct mbuf *m, + struct sockaddr_in sa4, da4; + + sa4.sin_addr = spt->slirp->vhost_addr; +- sa4.sin_port = recv_tp->udp.uh_dport; ++ sa4.sin_port = hdr->udp.uh_dport; + da4.sin_addr = ((struct sockaddr_in *)&spt->client_addr)->sin_addr; + da4.sin_port = spt->client_port; + +@@ -183,7 +183,7 @@ static int tftp_send_oack(struct tftp_session *spt, const char *keys[], + + tp = tftp_prep_mbuf_data(spt, m); + +- tp->tp_op = htons(TFTP_OACK); ++ tp->hdr.tp_op = htons(TFTP_OACK); + for (i = 0; i < nb; i++) { + n += snprintf(tp->x.tp_buf + n, sizeof(tp->x.tp_buf) - n, "%s", + keys[i]) + +@@ -195,7 +195,7 @@ static int tftp_send_oack(struct tftp_session *spt, const char *keys[], + + m->m_len = sizeof(struct tftp_t) - (TFTP_BLOCKSIZE_MAX + 2) + n - + sizeof(struct udphdr); +- tftp_udp_output(spt, m, recv_tp); ++ tftp_udp_output(spt, m, &recv_tp->hdr); + + return 0; + } +@@ -216,21 +216,21 @@ static void tftp_send_error(struct tftp_session *spt, uint16_t errorcode, + + tp = tftp_prep_mbuf_data(spt, m); + +- tp->tp_op = htons(TFTP_ERROR); ++ tp->hdr.tp_op = htons(TFTP_ERROR); + tp->x.tp_error.tp_error_code = htons(errorcode); + slirp_pstrcpy((char *)tp->x.tp_error.tp_msg, sizeof(tp->x.tp_error.tp_msg), + msg); + + m->m_len = sizeof(struct tftp_t) - (TFTP_BLOCKSIZE_MAX + 2) + 3 + + strlen(msg) - sizeof(struct udphdr); +- tftp_udp_output(spt, m, recv_tp); ++ tftp_udp_output(spt, m, &recv_tp->hdr); + + out: + tftp_session_terminate(spt); + } + + static void tftp_send_next_block(struct tftp_session *spt, +- struct tftp_t *recv_tp) ++ struct tftphdr *hdr) + { + struct mbuf *m; + struct tftp_t *tp; +@@ -244,7 +244,7 @@ static void tftp_send_next_block(struct tftp_session *spt, + + tp = tftp_prep_mbuf_data(spt, m); + +- tp->tp_op = htons(TFTP_DATA); ++ tp->hdr.tp_op = htons(TFTP_DATA); + tp->x.tp_data.tp_block_nr = htons((spt->block_nr + 1) & 0xffff); + + nobytes = tftp_read_data(spt, 
spt->block_nr, tp->x.tp_data.tp_buf, +@@ -262,7 +262,7 @@ static void tftp_send_next_block(struct tftp_session *spt, + + m->m_len = sizeof(struct tftp_t) - (TFTP_BLOCKSIZE_MAX - nobytes) - + sizeof(struct udphdr); +- tftp_udp_output(spt, m, recv_tp); ++ tftp_udp_output(spt, m, hdr); + + if (nobytes == spt->block_size) { + tftp_session_update(spt); +@@ -285,12 +285,12 @@ static void tftp_handle_rrq(Slirp *slirp, struct sockaddr_storage *srcsas, + int nb_options = 0; + + /* check if a session already exists and if so terminate it */ +- s = tftp_session_find(slirp, srcsas, tp); ++ s = tftp_session_find(slirp, srcsas, &tp->hdr); + if (s >= 0) { + tftp_session_terminate(&slirp->tftp_sessions[s]); + } + +- s = tftp_session_allocate(slirp, srcsas, tp); ++ s = tftp_session_allocate(slirp, srcsas, &tp->hdr); + + if (s < 0) { + return; +@@ -411,29 +411,29 @@ static void tftp_handle_rrq(Slirp *slirp, struct sockaddr_storage *srcsas, + } + + spt->block_nr = 0; +- tftp_send_next_block(spt, tp); ++ tftp_send_next_block(spt, &tp->hdr); + } + + static void tftp_handle_ack(Slirp *slirp, struct sockaddr_storage *srcsas, +- struct tftp_t *tp, int pktlen) ++ struct tftphdr *hdr) + { + int s; + +- s = tftp_session_find(slirp, srcsas, tp); ++ s = tftp_session_find(slirp, srcsas, hdr); + + if (s < 0) { + return; + } + +- tftp_send_next_block(&slirp->tftp_sessions[s], tp); ++ tftp_send_next_block(&slirp->tftp_sessions[s], hdr); + } + + static void tftp_handle_error(Slirp *slirp, struct sockaddr_storage *srcsas, +- struct tftp_t *tp, int pktlen) ++ struct tftphdr *hdr) + { + int s; + +- s = tftp_session_find(slirp, srcsas, tp); ++ s = tftp_session_find(slirp, srcsas, hdr); + + if (s < 0) { + return; +@@ -444,23 +444,25 @@ static void tftp_handle_error(Slirp *slirp, struct sockaddr_storage *srcsas, + + void tftp_input(struct sockaddr_storage *srcsas, struct mbuf *m) + { +- struct tftp_t *tp = mtod_check(m, offsetof(struct tftp_t, x.tp_buf)); ++ struct tftphdr *hdr = mtod_check(m, 
sizeof(struct tftphdr)); + +- if (tp == NULL) { ++ if (hdr == NULL) { + return; + } + +- switch (ntohs(tp->tp_op)) { ++ switch (ntohs(hdr->tp_op)) { + case TFTP_RRQ: +- tftp_handle_rrq(m->slirp, srcsas, tp, m->m_len); ++ tftp_handle_rrq(m->slirp, srcsas, ++ mtod(m, struct tftp_t *), ++ m->m_len); + break; + + case TFTP_ACK: +- tftp_handle_ack(m->slirp, srcsas, tp, m->m_len); ++ tftp_handle_ack(m->slirp, srcsas, hdr); + break; + + case TFTP_ERROR: +- tftp_handle_error(m->slirp, srcsas, tp, m->m_len); ++ tftp_handle_error(m->slirp, srcsas, hdr); + break; + } + } +diff --git a/slirp/src/tftp.h b/slirp/src/tftp.h +index c47bb43c7d..021f6cf109 100644 +--- a/slirp/src/tftp.h ++++ b/slirp/src/tftp.h +@@ -18,9 +18,13 @@ + #define TFTP_FILENAME_MAX 512 + #define TFTP_BLOCKSIZE_MAX 1428 + +-struct tftp_t { ++struct tftphdr { + struct udphdr udp; + uint16_t tp_op; ++} SLIRP_PACKED; ++ ++struct tftp_t { ++ struct tftphdr hdr; + union { + struct { + uint16_t tp_block_nr; +-- +2.27.0 + diff --git a/kvm-tools-virtiofsd-fuse_lowlevel-Fix-fuse_out_header-er.patch b/kvm-tools-virtiofsd-fuse_lowlevel-Fix-fuse_out_header-er.patch new file mode 100755 index 0000000..3efef47 --- /dev/null +++ b/kvm-tools-virtiofsd-fuse_lowlevel-Fix-fuse_out_header-er.patch @@ -0,0 +1,55 @@ +From e483eea891139ee38138381ba6715b3a2be050cc Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:12 +0000 +Subject: [PATCH 16/18] tools/virtiofsd/fuse_lowlevel: Fix + fuse_out_header::error value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200303184314.155564-6-dgilbert@redhat.com> +Patchwork-id: 94128 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 5/7] tools/virtiofsd/fuse_lowlevel: Fix fuse_out_header::error value +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: Philippe Mathieu-Daudé + +Fix warning reported by Clang static code analyzer: + + CC tools/virtiofsd/fuse_lowlevel.o + tools/virtiofsd/fuse_lowlevel.c:195:9: warning: Value stored to 'error' is never read + error = -ERANGE; + ^ ~~~~~~~ + +Fixes: 3db2876 +Reported-by: Clang Static Analyzer +Reviewed-by: Ján Tomko +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 09c086b2a144324199f99a7d4de78c3276a486c1) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse_lowlevel.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 704c036..2dd36ec 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -192,7 +192,7 @@ int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + + if (error <= -1000 || error > 0) { + fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); +- error = -ERANGE; ++ out.error = -ERANGE; + } + + iov[0].iov_base = &out; +-- +1.8.3.1 + diff --git a/kvm-tools-virtiofsd-passthrough_ll-Fix-double-close.patch b/kvm-tools-virtiofsd-passthrough_ll-Fix-double-close.patch new file mode 100755 index 0000000..6af549a --- /dev/null +++ b/kvm-tools-virtiofsd-passthrough_ll-Fix-double-close.patch @@ -0,0 +1,56 @@ +From 8ce8ccc2a22798a89bac06a37427c3a3cea91a62 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:54 +0100 +Subject: [PATCH 3/9] tools/virtiofsd/passthrough_ll: Fix double close() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-2-dgilbert@redhat.com> +Patchwork-id: 96269 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/7] tools/virtiofsd/passthrough_ll: Fix double close() +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michael S. Tsirkin + +From: Philippe Mathieu-Daudé + +On success, the fdopendir() call closes fd. Later on the error +path we try to close an already-closed fd. This can lead to +use-after-free. Fix by only closing the fd if the fdopendir() +call failed. + +Cc: qemu-stable@nongnu.org +Fixes: b39bce121b (add dirp_map to hide lo_dirp pointers) +Reported-by: Coverity (CID 1421933 USE_AFTER_FREE) +Suggested-by: Peter Maydell +Signed-off-by: Philippe Mathieu-Daudé +Message-Id: <20200321120654.7985-1-philmd@redhat.com> +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e1cd92d95cd4f97b3464c4e08cd5b22bf5ca05cb) +Signed-off-by: Danilo C. L. 
de Paula +--- + tools/virtiofsd/passthrough_ll.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9cba3f1..50ff672 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1524,8 +1524,7 @@ out_err: + if (d) { + if (d->dp) { + closedir(d->dp); +- } +- if (fd != -1) { ++ } else if (fd != -1) { + close(fd); + } + free(d); +-- +1.8.3.1 + diff --git a/kvm-tpm-ppi-page-align-PPI-RAM.patch b/kvm-tpm-ppi-page-align-PPI-RAM.patch new file mode 100755 index 0000000..32c971d --- /dev/null +++ b/kvm-tpm-ppi-page-align-PPI-RAM.patch @@ -0,0 +1,58 @@ +From 7cb1c5e1416de9a09180f0930d2a216c77e8cdbd Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 30 Jan 2020 16:01:10 +0000 +Subject: [PATCH 07/15] tpm-ppi: page-align PPI RAM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200130160110.126086-1-marcandre.lureau@redhat.com> +Patchwork-id: 93600 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH] tpm-ppi: page-align PPI RAM +Bugzilla: 1787444 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Philippe Mathieu-Daudé + +post-copy migration fails on destination with error such as: +2019-12-26T10:22:44.714644Z qemu-kvm: ram_block_discard_range: +Unaligned start address: 0x559d2afae9a0 + +Use qemu_memalign() to constrain the PPI RAM memory alignment. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Marc-André Lureau +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Dr. 
David Alan Gilbert +Reviewed-by: Stefan Berger +Signed-off-by: Stefan Berger +Message-id: 20200103074000.1006389-3-marcandre.lureau@redhat.com + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1787444 +Brew: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=26122940 + +(cherry picked from commit 71e415c8a75c130875f14d6b2136825789feb297) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + hw/tpm/tpm_ppi.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/tpm/tpm_ppi.c b/hw/tpm/tpm_ppi.c +index ff31459..6d9c1a3 100644 +--- a/hw/tpm/tpm_ppi.c ++++ b/hw/tpm/tpm_ppi.c +@@ -43,7 +43,8 @@ void tpm_ppi_reset(TPMPPI *tpmppi) + void tpm_ppi_init(TPMPPI *tpmppi, struct MemoryRegion *m, + hwaddr addr, Object *obj) + { +- tpmppi->buf = g_malloc0(HOST_PAGE_ALIGN(TPM_PPI_ADDR_SIZE)); ++ tpmppi->buf = qemu_memalign(qemu_real_host_page_size, ++ HOST_PAGE_ALIGN(TPM_PPI_ADDR_SIZE)); + memory_region_init_ram_device_ptr(&tpmppi->ram, obj, "tpm-ppi", + TPM_PPI_ADDR_SIZE, tpmppi->buf); + vmstate_register_ram(&tpmppi->ram, DEVICE(obj)); +-- +1.8.3.1 + diff --git a/kvm-trace-update-qemu-trace-stap-to-Python-3.patch b/kvm-trace-update-qemu-trace-stap-to-Python-3.patch new file mode 100755 index 0000000..c49aecd --- /dev/null +++ b/kvm-trace-update-qemu-trace-stap-to-Python-3.patch @@ -0,0 +1,82 @@ +From e7cdcd1e39c4c030a32c9e8ef79316eae8555bc8 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 16 Jan 2020 17:52:48 +0000 +Subject: [PATCH 04/15] trace: update qemu-trace-stap to Python 3 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +Message-id: <20200116175248.286556-2-stefanha@redhat.com> +Patchwork-id: 93365 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] trace: update qemu-trace-stap to Python 3 +Bugzilla: 1787395 +RH-Acked-by: John Snow +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Dr. 
David Alan Gilbert + +qemu-trace-stap does not support Python 3 yet: + + $ scripts/qemu-trace-stap list path/to/qemu-system-x86_64 + Traceback (most recent call last): + File "scripts/qemu-trace-stap", line 175, in + main() + File "scripts/qemu-trace-stap", line 171, in main + args.func(args) + File "scripts/qemu-trace-stap", line 118, in cmd_list + print_probes(args.verbose, "*") + File "scripts/qemu-trace-stap", line 114, in print_probes + if line.startswith(prefix): + TypeError: startswith first arg must be bytes or a tuple of bytes, not str + +Now that QEMU requires Python 3.5 or later we can switch to pure Python +3. Use Popen()'s universal_newlines=True argument to treat stdout as +text instead of binary. + +Fixes: 62dd1048c0bd ("trace: add ability to do simple printf logging via systemtap") +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1787395 +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Message-id: 20200107112438.383958-1-stefanha@redhat.com +Message-Id: <20200107112438.383958-1-stefanha@redhat.com> +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 3f0097169bb60268cc5dda0c5ea47c31ab57b22f) +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Danilo C. L. de Paula +--- + scripts/qemu-trace-stap | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/scripts/qemu-trace-stap b/scripts/qemu-trace-stap +index 91d1051..90527eb 100755 +--- a/scripts/qemu-trace-stap ++++ b/scripts/qemu-trace-stap +@@ -1,4 +1,4 @@ +-#!/usr/bin/python ++#!/usr/bin/env python3 + # -*- python -*- + # + # Copyright (C) 2019 Red Hat, Inc +@@ -18,8 +18,6 @@ + # You should have received a copy of the GNU General Public License + # along with this program; if not, see . 
+ +-from __future__ import print_function +- + import argparse + import copy + import os.path +@@ -104,7 +102,9 @@ def cmd_list(args): + if verbose: + print("Listing probes with name '%s'" % script) + proc = subprocess.Popen(["stap", "-l", script], +- stdout=subprocess.PIPE, env=tapset_env(tapsets)) ++ stdout=subprocess.PIPE, ++ universal_newlines=True, ++ env=tapset_env(tapsets)) + out, err = proc.communicate() + if proc.returncode != 0: + print("No probes found, are the tapsets installed in %s" % tapset_dir(args.binary)) +-- +1.8.3.1 + diff --git a/kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch b/kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch new file mode 100755 index 0000000..059445b --- /dev/null +++ b/kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch @@ -0,0 +1,116 @@ +From ba3068eb1a349ec4ed8b7ccdae76450f0c315be9 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 19 Nov 2020 17:23:11 -0500 +Subject: [PATCH 18/18] trace: use STAP_SDT_V2 to work around symbol visibility +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +Message-id: <20201119172311.942629-2-stefanha@redhat.com> +Patchwork-id: 99779 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] trace: use STAP_SDT_V2 to work around symbol visibility +Bugzilla: 1898700 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Gerd Hoffmann +RH-Acked-by: Philippe Mathieu-Daudé + +QEMU binaries no longer launch successfully with recent SystemTap +releases. This is because modular QEMU builds link the sdt semaphores +into the main binary instead of into the shared objects where they are +used. The symbol visibility of semaphores is 'hidden' and the dynamic +linker prints an error during module loading: + + $ ./configure --enable-trace-backends=dtrace --enable-modules ... + ... 
+ Failed to open module: /builddir/build/BUILD/qemu-4.2.0/s390x-softmmu/../block-curl.so: undefined symbol: qemu_curl_close_semaphore + +The long-term solution is to generate per-module dtrace .o files and +link them into the module instead of the main binary. + +In the short term we can define STAP_SDT_V2 so dtrace(1) produces a .o +file with 'default' symbol visibility instead of 'hidden'. This +workaround is small and easier to merge for QEMU 5.2 and downstream +backports. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1898700 +Cc: wcohen@redhat.com +Cc: fche@redhat.com +Cc: kraxel@redhat.com +Cc: rjones@redhat.com +Cc: ddepaula@redhat.com +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Miroslav Rezanina + +(cherry picked from commit 4b265c79a85bb35abe19aacea6954c1616521639) +Signed-off-by: Stefan Hajnoczi + +Conflicts: + trace/meson.build + Downstream uses makefiles, so move the dtrace invocation changes to + rules.mak and Makefile. +Signed-off-by: Danilo C. L. 
de Paula +--- + Makefile | 4 ++-- + configure | 7 +++++++ + rules.mak | 2 +- + 3 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/Makefile b/Makefile +index ff05c309497..29b01a13ee3 100644 +--- a/Makefile ++++ b/Makefile +@@ -198,7 +198,7 @@ tracetool-y += $(shell find $(SRC_PATH)/scripts/tracetool -name "*.py") + $< > $@,"GEN","$(@:%-timestamp=%)") + + %/trace-dtrace.h: %/trace-dtrace.dtrace $(tracetool-y) +- $(call quiet-command,dtrace -o $@ -h -s $<, "GEN","$@") ++ $(call quiet-command,dtrace -o $@ -DSTAP_SDT_V2 -h -s $<, "GEN","$@") + + %/trace-dtrace.o: %/trace-dtrace.dtrace $(tracetool-y) + +@@ -258,7 +258,7 @@ trace-dtrace-root.dtrace-timestamp: $(SRC_PATH)/trace-events $(BUILD_DIR)/config + $< > $@,"GEN","$(@:%-timestamp=%)") + + trace-dtrace-root.h: trace-dtrace-root.dtrace +- $(call quiet-command,dtrace -o $@ -h -s $<, "GEN","$@") ++ $(call quiet-command,dtrace -o $@ -DSTAP_SDT_V2 -h -s $<, "GEN","$@") + + trace-dtrace-root.o: trace-dtrace-root.dtrace + +diff --git a/configure b/configure +index 5120c1409a7..c62b61403f6 100755 +--- a/configure ++++ b/configure +@@ -5275,6 +5275,13 @@ if have_backend "dtrace"; then + trace_backend_stap="no" + if has 'stap' ; then + trace_backend_stap="yes" ++ ++ # Workaround to avoid dtrace(1) producing a file with 'hidden' symbol ++ # visibility. Define STAP_SDT_V2 to produce 'default' symbol visibility ++ # instead. QEMU --enable-modules depends on this because the SystemTap ++ # semaphores are linked into the main binary and not the module's shared ++ # object. 
++ QEMU_CFLAGS="$QEMU_CFLAGS -DSTAP_SDT_V2" + fi + fi + +diff --git a/rules.mak b/rules.mak +index 967295dd2b6..bdfc223a5a1 100644 +--- a/rules.mak ++++ b/rules.mak +@@ -101,7 +101,7 @@ LINK = $(call quiet-command, $(LINKPROG) $(QEMU_LDFLAGS) $(QEMU_CFLAGS) $(CFLAGS + -c -o $@ $<,"OBJC","$(TARGET_DIR)$@") + + %.o: %.dtrace +- $(call quiet-command,dtrace -o $@ -G -s $<,"GEN","$(TARGET_DIR)$@") ++ $(call quiet-command,dtrace -o $@ -DSTAP_SDT_V2 -G -s $<,"GEN","$(TARGET_DIR)$@") + + DSO_OBJ_CFLAGS := -fPIC -DBUILD_DSO + module-common.o: CFLAGS += $(DSO_OBJ_CFLAGS) +-- +2.27.0 + diff --git a/kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch b/kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch new file mode 100755 index 0000000..4da71cc --- /dev/null +++ b/kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch @@ -0,0 +1,53 @@ +From 87cacc268f37758553ad93fefa8b312ed0bd2520 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:43 -0400 +Subject: [PATCH 5/9] tx_pkt: switch to use qemu_receive_packet_iov() for + loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-6-jmaloy@redhat.com> +Patchwork-id: 101788 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 5/9] tx_pkt: switch to use qemu_receive_packet_iov() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +This patch switches to use qemu_receive_receive_iov() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 8c552542b81e56ff532dd27ec6e5328954bdda73) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/net_tx_pkt.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c +index 54d4c3bbd0..646cdfaf4d 100644 +--- a/hw/net/net_tx_pkt.c ++++ b/hw/net/net_tx_pkt.c +@@ -544,7 +544,7 @@ static inline void net_tx_pkt_sendv(struct NetTxPkt *pkt, + NetClientState *nc, const struct iovec *iov, int iov_cnt) + { + if (pkt->is_loopback) { +- nc->info->receive_iov(nc, iov, iov_cnt); ++ qemu_receive_packet_iov(nc, iov, iov_cnt); + } else { + qemu_sendv_packet(nc, iov, iov_cnt); + } +-- +2.27.0 + diff --git a/kvm-udp-check-upd_input-buffer-size.patch b/kvm-udp-check-upd_input-buffer-size.patch new file mode 100755 index 0000000..0f3c6f3 --- /dev/null +++ b/kvm-udp-check-upd_input-buffer-size.patch @@ -0,0 +1,52 @@ +From 1b8aa33b218a8ff3e8aa2f1b6875df40fd70f0ed Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:40 -0400 +Subject: [PATCH 11/14] udp: check upd_input buffer size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-8-marcandre.lureau@redhat.com> +Patchwork-id: 101826 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 7/8] udp: check upd_input buffer size +Bugzilla: 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Fixes: CVE-2021-3594 +Fixes: https://gitlab.freedesktop.org/slirp/libslirp/-/issues/47 + +Signed-off-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1970853 + +(cherry picked from commit 74572be49247c8c5feae7c6e0b50c4f569ca9824) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/udp.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/slirp/src/udp.c b/slirp/src/udp.c +index ae23ba4b2a..86142bba14 100644 +--- a/slirp/src/udp.c ++++ b/slirp/src/udp.c +@@ -90,7 +90,10 @@ void 
udp_input(register struct mbuf *m, int iphlen) + /* + * Get IP and UDP header together in first mbuf. + */ +- ip = mtod(m, struct ip *); ++ ip = mtod_check(m, iphlen + sizeof(struct udphdr)); ++ if (ip == NULL) { ++ goto bad; ++ } + uh = (struct udphdr *)((char *)ip + iphlen); + + /* +-- +2.27.0 + diff --git a/kvm-upd6-check-udp6_input-buffer-size.patch b/kvm-upd6-check-udp6_input-buffer-size.patch new file mode 100755 index 0000000..2aa3a24 --- /dev/null +++ b/kvm-upd6-check-udp6_input-buffer-size.patch @@ -0,0 +1,52 @@ +From 6808086932ddc83fd748c46fea495e7004299b55 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:31 -0400 +Subject: [PATCH 08/14] upd6: check udp6_input buffer size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-5-marcandre.lureau@redhat.com> +Patchwork-id: 101822 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 4/8] upd6: check udp6_input buffer size +Bugzilla: 1970835 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Fixes: CVE-2021-3593 +Fixes: https://gitlab.freedesktop.org/slirp/libslirp/-/issues/45 + +Signed-off-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1970835 + +(cherry picked from commit de71c15de66ba9350bf62c45b05f8fbff166517b) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/udp6.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/slirp/src/udp6.c b/slirp/src/udp6.c +index 6f9486bbca..8c490e4d10 100644 +--- a/slirp/src/udp6.c ++++ b/slirp/src/udp6.c +@@ -28,7 +28,10 @@ void udp6_input(struct mbuf *m) + ip = mtod(m, struct ip6 *); + m->m_len -= iphlen; + m->m_data += iphlen; +- uh = mtod(m, struct udphdr *); ++ uh = mtod_check(m, sizeof(struct udphdr)); ++ if (uh == NULL) { ++ goto bad; ++ } + m->m_len += iphlen; + m->m_data -= 
iphlen; + +-- +2.27.0 + diff --git a/kvm-usb-fix-setup_len-init-CVE-2020-14364.patch b/kvm-usb-fix-setup_len-init-CVE-2020-14364.patch new file mode 100755 index 0000000..5e63299 --- /dev/null +++ b/kvm-usb-fix-setup_len-init-CVE-2020-14364.patch @@ -0,0 +1,102 @@ +From feb16ff29a13a4286389bb8b9d4f541aab9b84f1 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 3 Sep 2020 15:27:13 -0400 +Subject: [PATCH] usb: fix setup_len init (CVE-2020-14364) + +RH-Author: Jon Maloy +Message-id: <20200903152713.1420531-2-jmaloy@redhat.com> +Patchwork-id: 98271 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] usb: fix setup_len init (CVE-2020-14364) +Bugzilla: 1869710 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Thomas Huth +RH-Acked-by: Gerd Hoffmann + +From: Gerd Hoffmann + +Store calculated setup_len in a local variable, verify it, and only +write it to the struct (USBDevice->setup_len) in case it passed the +sanity checks. + +This prevents other code (do_token_{in,out} functions specifically) +from working with invalid USBDevice->setup_len values and overrunning +the USBDevice->setup_buf[] buffer. + +Fixes: CVE-2020-14364 +Signed-off-by: Gerd Hoffmann +Tested-by: Gonglei +Reviewed-by: Li Qiang +Message-id: 20200825053636.29648-1-kraxel@redhat.com +(cherry picked from commit b946434f2659a182afc17e155be6791ebfb302eb) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/usb/core.c | 16 ++++++++++------ + 1 file changed, 10 insertions(+), 6 deletions(-) + +diff --git a/hw/usb/core.c b/hw/usb/core.c +index 5abd128b6b..5234dcc73f 100644 +--- a/hw/usb/core.c ++++ b/hw/usb/core.c +@@ -129,6 +129,7 @@ void usb_wakeup(USBEndpoint *ep, unsigned int stream) + static void do_token_setup(USBDevice *s, USBPacket *p) + { + int request, value, index; ++ unsigned int setup_len; + + if (p->iov.size != 8) { + p->status = USB_RET_STALL; +@@ -138,14 +139,15 @@ static void do_token_setup(USBDevice *s, USBPacket *p) + usb_packet_copy(p, s->setup_buf, p->iov.size); + s->setup_index = 0; + p->actual_length = 0; +- s->setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; +- if (s->setup_len > sizeof(s->data_buf)) { ++ setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; ++ if (setup_len > sizeof(s->data_buf)) { + fprintf(stderr, + "usb_generic_handle_packet: ctrl buffer too small (%d > %zu)\n", +- s->setup_len, sizeof(s->data_buf)); ++ setup_len, sizeof(s->data_buf)); + p->status = USB_RET_STALL; + return; + } ++ s->setup_len = setup_len; + + request = (s->setup_buf[0] << 8) | s->setup_buf[1]; + value = (s->setup_buf[3] << 8) | s->setup_buf[2]; +@@ -259,26 +261,28 @@ static void do_token_out(USBDevice *s, USBPacket *p) + static void do_parameter(USBDevice *s, USBPacket *p) + { + int i, request, value, index; ++ unsigned int setup_len; + + for (i = 0; i < 8; i++) { + s->setup_buf[i] = p->parameter >> (i*8); + } + + s->setup_state = SETUP_STATE_PARAM; +- s->setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; + s->setup_index = 0; + + request = (s->setup_buf[0] << 8) | s->setup_buf[1]; + value = (s->setup_buf[3] << 8) | s->setup_buf[2]; + index = (s->setup_buf[5] << 8) | s->setup_buf[4]; + +- if (s->setup_len > sizeof(s->data_buf)) { ++ setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; ++ if (setup_len > sizeof(s->data_buf)) { + fprintf(stderr, + "usb_generic_handle_packet: ctrl buffer too small (%d > %zu)\n", +- s->setup_len, 
sizeof(s->data_buf)); ++ setup_len, sizeof(s->data_buf)); + p->status = USB_RET_STALL; + return; + } ++ s->setup_len = setup_len; + + if (p->pid == USB_TOKEN_OUT) { + usb_packet_copy(p, s->data_buf, s->setup_len); +-- +2.27.0 + diff --git a/kvm-usbredir-Prevent-recursion-in-usbredir_write.patch b/kvm-usbredir-Prevent-recursion-in-usbredir_write.patch new file mode 100755 index 0000000..8f08256 --- /dev/null +++ b/kvm-usbredir-Prevent-recursion-in-usbredir_write.patch @@ -0,0 +1,106 @@ +From 8f6311159977b8ee4b78172caa411d3cee4d2ae5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 14 Jan 2020 20:23:30 +0000 +Subject: [PATCH 4/5] usbredir: Prevent recursion in usbredir_write +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200114202331.51831-2-dgilbert@redhat.com> +Patchwork-id: 93344 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] usbredir: Prevent recursion in usbredir_write +Bugzilla: 1790844 +RH-Acked-by: Peter Xu +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Gerd Hoffmann + +From: "Dr. David Alan Gilbert" + +I've got a case where usbredir_write manages to call back into itself +via spice; this patch causes the recursion to fail (0 bytes) the write; +this seems to avoid the deadlock I was previously seeing. + +I can't say I fully understand the interaction of usbredir and spice; +but there are a few similar guards in spice and usbredir +to catch other cases especially onces also related to spice_server_char_device_wakeup + +This case seems to be triggered by repeated migration+repeated +reconnection of the viewer; but my debugging suggests the migration +finished before this hits. 
+ +The backtrace of the hang looks like: + reds_handle_ticket + reds_handle_other_links + reds_channel_do_link + red_channel_connect + spicevmc_connect + usbredir_create_parser + usbredirparser_do_write + usbredir_write + qemu_chr_fe_write + qemu_chr_write + qemu_chr_write_buffer + spice_chr_write + spice_server_char_device_wakeup + red_char_device_wakeup + red_char_device_write_to_device + vmc_write + usbredirparser_do_write + usbredir_write + qemu_chr_fe_write + qemu_chr_write + qemu_chr_write_buffer + qemu_mutex_lock_impl + +and we fail as we land through qemu_chr_write_buffer's lock +twice. + +Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1752320 + +Signed-off-by: Dr. David Alan Gilbert +Message-Id: <20191218113012.13331-1-dgilbert@redhat.com> +Signed-off-by: Gerd Hoffmann +(cherry picked from commit 394642a8d3742c885e397d5bb5ee0ec40743cdc6) +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/redirect.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/hw/usb/redirect.c b/hw/usb/redirect.c +index e0f5ca6..97f2c3a 100644 +--- a/hw/usb/redirect.c ++++ b/hw/usb/redirect.c +@@ -113,6 +113,7 @@ struct USBRedirDevice { + /* Properties */ + CharBackend cs; + bool enable_streams; ++ bool in_write; + uint8_t debug; + int32_t bootindex; + char *filter_str; +@@ -290,6 +291,13 @@ static int usbredir_write(void *priv, uint8_t *data, int count) + return 0; + } + ++ /* Recursion check */ ++ if (dev->in_write) { ++ DPRINTF("usbredir_write recursion\n"); ++ return 0; ++ } ++ dev->in_write = true; ++ + r = qemu_chr_fe_write(&dev->cs, data, count); + if (r < count) { + if (!dev->watch) { +@@ -300,6 +308,7 @@ static int usbredir_write(void *priv, uint8_t *data, int count) + r = 0; + } + } ++ dev->in_write = false; + return r; + } + +-- +1.8.3.1 + diff --git a/kvm-util-Introduce-qemu_get_host_name.patch b/kvm-util-Introduce-qemu_get_host_name.patch new file mode 100755 index 0000000..da21888 --- /dev/null +++ b/kvm-util-Introduce-qemu_get_host_name.patch @@ -0,0 
+1,123 @@ +From 41510fba34cda98cb85a8d04e46dcfdd9a91aa61 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 24 Dec 2020 12:53:03 -0500 +Subject: [PATCH 3/5] util: Introduce qemu_get_host_name() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201224125304.62697-3-marcandre.lureau@redhat.com> +Patchwork-id: 100499 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/3] util: Introduce qemu_get_host_name() +Bugzilla: 1910326 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé + +From: Michal Privoznik + +This function offers operating system agnostic way to fetch host +name. It is implemented for both POSIX-like and Windows systems. + +Signed-off-by: Michal Privoznik +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Daniel P. Berrangé +Cc: qemu-stable@nongnu.org +Signed-off-by: Michael Roth + +(cherry picked from commit e47f4765afcab2b78dfa5b0115abf64d1d49a5d3) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qemu/osdep.h | 10 ++++++++++ + util/oslib-posix.c | 35 +++++++++++++++++++++++++++++++++++ + util/oslib-win32.c | 13 +++++++++++++ + 3 files changed, 58 insertions(+) + +diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h +index 0f97d68586a..d427e81a427 100644 +--- a/include/qemu/osdep.h ++++ b/include/qemu/osdep.h +@@ -620,4 +620,14 @@ static inline void qemu_reset_optind(void) + #endif + } + ++/** ++ * qemu_get_host_name: ++ * @errp: Error object ++ * ++ * Operating system agnostic way of querying host name. ++ * ++ * Returns allocated hostname (caller should free), NULL on failure. 
++ */ ++char *qemu_get_host_name(Error **errp); ++ + #endif +diff --git a/util/oslib-posix.c b/util/oslib-posix.c +index 5a291cc9820..8f88e4dbe10 100644 +--- a/util/oslib-posix.c ++++ b/util/oslib-posix.c +@@ -726,3 +726,38 @@ void sigaction_invoke(struct sigaction *action, + } + action->sa_sigaction(info->ssi_signo, &si, NULL); + } ++ ++#ifndef HOST_NAME_MAX ++# ifdef _POSIX_HOST_NAME_MAX ++# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX ++# else ++# define HOST_NAME_MAX 255 ++# endif ++#endif ++ ++char *qemu_get_host_name(Error **errp) ++{ ++ long len = -1; ++ g_autofree char *hostname = NULL; ++ ++#ifdef _SC_HOST_NAME_MAX ++ len = sysconf(_SC_HOST_NAME_MAX); ++#endif /* _SC_HOST_NAME_MAX */ ++ ++ if (len < 0) { ++ len = HOST_NAME_MAX; ++ } ++ ++ /* Unfortunately, gethostname() below does not guarantee a ++ * NULL terminated string. Therefore, allocate one byte more ++ * to be sure. */ ++ hostname = g_new0(char, len + 1); ++ ++ if (gethostname(hostname, len) < 0) { ++ error_setg_errno(errp, errno, ++ "cannot get hostname"); ++ return NULL; ++ } ++ ++ return g_steal_pointer(&hostname); ++} +diff --git a/util/oslib-win32.c b/util/oslib-win32.c +index e9b14ab1784..3b49d272972 100644 +--- a/util/oslib-win32.c ++++ b/util/oslib-win32.c +@@ -808,3 +808,16 @@ bool qemu_write_pidfile(const char *filename, Error **errp) + } + return true; + } ++ ++char *qemu_get_host_name(Error **errp) ++{ ++ wchar_t tmp[MAX_COMPUTERNAME_LENGTH + 1]; ++ DWORD size = G_N_ELEMENTS(tmp); ++ ++ if (GetComputerNameW(tmp, &size) == 0) { ++ error_setg_win32(errp, GetLastError(), "failed close handle"); ++ return NULL; ++ } ++ ++ return g_utf16_to_utf8(tmp, size, NULL, NULL, NULL); ++} +-- +2.27.0 + diff --git a/kvm-util-add-slirp_fmt-helpers.patch b/kvm-util-add-slirp_fmt-helpers.patch new file mode 100755 index 0000000..31af599 --- /dev/null +++ b/kvm-util-add-slirp_fmt-helpers.patch @@ -0,0 +1,140 @@ +From 5dc50c6bca059a9cda6677b1fd0187df1de78ed7 Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Thu, 
13 Feb 2020 15:50:48 +0000 +Subject: [PATCH 2/7] util: add slirp_fmt() helpers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200213155049.3936-2-jmaloy@redhat.com> +Patchwork-id: 93824 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] util: add slirp_fmt() helpers +Bugzilla: 1798994 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Various calls to snprintf() in libslirp assume that snprintf() returns +"only" the number of bytes written (excluding terminating NUL). + +https://pubs.opengroup.org/onlinepubs/9699919799/functions/snprintf.html#tag_16_159_04 + +"Upon successful completion, the snprintf() function shall return the +number of bytes that would be written to s had n been sufficiently +large excluding the terminating null byte." + +Introduce slirp_fmt() that handles several pathological cases the +way libslirp usually expect: + +- treat error as fatal (instead of silently returning -1) + +- fmt0() will always \0 end + +- return the number of bytes actually written (instead of what would +have been written, which would usually result in OOB later), including +the ending \0 for fmt0() + +- warn if truncation happened (instead of ignoring) + +Other less common cases can still be handled with strcpy/snprintf() etc. + +Signed-off-by: Marc-André Lureau +Reviewed-by: Samuel Thibault +Message-Id: <20200127092414.169796-2-marcandre.lureau@redhat.com> +(cherry picked from libslirp commit 30648c03b27fb8d9611b723184216cd3174b6775) +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. 
de Paula +--- + slirp/src/util.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + slirp/src/util.h | 3 +++ + 2 files changed, 65 insertions(+) + +diff --git a/slirp/src/util.c b/slirp/src/util.c +index e596087..e3b6257 100644 +--- a/slirp/src/util.c ++++ b/slirp/src/util.c +@@ -364,3 +364,65 @@ void slirp_pstrcpy(char *buf, int buf_size, const char *str) + } + *q = '\0'; + } ++ ++static int slirp_vsnprintf(char *str, size_t size, ++ const char *format, va_list args) ++{ ++ int rv = vsnprintf(str, size, format, args); ++ ++ if (rv < 0) { ++ g_error("vsnprintf() failed: %s", g_strerror(errno)); ++ } ++ ++ return rv; ++} ++ ++/* ++ * A snprintf()-like function that: ++ * - returns the number of bytes written (excluding optional \0-ending) ++ * - dies on error ++ * - warn on truncation ++ */ ++int slirp_fmt(char *str, size_t size, const char *format, ...) ++{ ++ va_list args; ++ int rv; ++ ++ va_start(args, format); ++ rv = slirp_vsnprintf(str, size, format, args); ++ va_end(args); ++ ++ if (rv > size) { ++ g_critical("vsnprintf() truncation"); ++ } ++ ++ return MIN(rv, size); ++} ++ ++/* ++ * A snprintf()-like function that: ++ * - always \0-end (unless size == 0) ++ * - returns the number of bytes actually written, including \0 ending ++ * - dies on error ++ * - warn on truncation ++ */ ++int slirp_fmt0(char *str, size_t size, const char *format, ...) 
++{ ++ va_list args; ++ int rv; ++ ++ va_start(args, format); ++ rv = slirp_vsnprintf(str, size, format, args); ++ va_end(args); ++ ++ if (rv >= size) { ++ g_critical("vsnprintf() truncation"); ++ if (size > 0) ++ str[size - 1] = '\0'; ++ rv = size; ++ } else { ++ rv += 1; /* include \0 */ ++ } ++ ++ return rv; ++} +diff --git a/slirp/src/util.h b/slirp/src/util.h +index 3c6223c..0558dfc 100644 +--- a/slirp/src/util.h ++++ b/slirp/src/util.h +@@ -177,4 +177,7 @@ static inline int slirp_socket_set_fast_reuse(int fd) + + void slirp_pstrcpy(char *buf, int buf_size, const char *str); + ++int slirp_fmt(char *str, size_t size, const char *format, ...); ++int slirp_fmt0(char *str, size_t size, const char *format, ...); ++ + #endif +-- +1.8.3.1 + diff --git a/kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch b/kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch new file mode 100755 index 0000000..8e58473 --- /dev/null +++ b/kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch @@ -0,0 +1,79 @@ +From f53c2c68db7780353a915072f8c953a74149b1f7 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:42 -0500 +Subject: [PATCH 3/7] vfio: Create shared routine for scanning info + capabilities +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-4-cohuck@redhat.com> +Patchwork-id: 100678 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/7] vfio: Create shared routine for scanning info capabilities +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +Rather than duplicating the same loop in multiple locations, +create a static function to do the work. 
+ +Signed-off-by: Matthew Rosato +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Cornelia Huck +Signed-off-by: Alex Williamson +(cherry picked from commit 3ab7a0b40d4be5ade3b61d4afd1518193b199423) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/common.c | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 5ca11488d67..77d62d2dcdf 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -826,17 +826,12 @@ static void vfio_listener_release(VFIOContainer *container) + } + } + +-struct vfio_info_cap_header * +-vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) ++static struct vfio_info_cap_header * ++vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id) + { + struct vfio_info_cap_header *hdr; +- void *ptr = info; +- +- if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { +- return NULL; +- } + +- for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { ++ for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } +@@ -845,6 +840,16 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) + return NULL; + } + ++struct vfio_info_cap_header * ++vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) ++{ ++ if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { ++ return NULL; ++ } ++ ++ return vfio_get_cap((void *)info, info->cap_offset, id); ++} ++ + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) + { +-- +2.27.0 + diff --git a/kvm-vfio-Find-DMA-available-capability.patch b/kvm-vfio-Find-DMA-available-capability.patch new file mode 100755 index 0000000..b81bcc4 --- /dev/null +++ b/kvm-vfio-Find-DMA-available-capability.patch @@ -0,0 +1,91 @@ +From e6147c5a23a75361b1374bfb4b96403d243b5c38 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:43 -0500 +Subject: [PATCH 4/7] vfio: 
Find DMA available capability + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-5-cohuck@redhat.com> +Patchwork-id: 100677 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 4/7] vfio: Find DMA available capability +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +The underlying host may be limiting the number of outstanding DMA +requests for type 1 IOMMU. Add helper functions to check for the +DMA available capability and retrieve the current number of DMA +mappings allowed. + +Signed-off-by: Matthew Rosato +Reviewed-by: Cornelia Huck +[aw: vfio_get_info_dma_avail moved inside CONFIG_LINUX] +Signed-off-by: Alex Williamson +(cherry picked from commit 7486a62845b1e12011dd99973e4739f69d57cd38) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/common.c | 31 +++++++++++++++++++++++++++++++ + include/hw/vfio/vfio-common.h | 2 ++ + 2 files changed, 33 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 77d62d2dcdf..23efdfadebd 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -850,6 +850,37 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) + return vfio_get_cap((void *)info, info->cap_offset, id); + } + ++static struct vfio_info_cap_header * ++vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) ++{ ++ if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { ++ return NULL; ++ } ++ ++ return vfio_get_cap((void *)info, info->cap_offset, id); ++} ++ ++bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, ++ unsigned int *avail) ++{ ++ struct vfio_info_cap_header *hdr; ++ struct vfio_iommu_type1_info_dma_avail *cap; ++ ++ /* If the capability cannot be found, assume no DMA limiting */ ++ hdr = vfio_get_iommu_type1_info_cap(info, ++ VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); ++ if (hdr == NULL) { ++ return false; ++ } ++ ++ if (avail != NULL) { ++ cap = (void *) hdr; ++ *avail = cap->avail; ++ } 
++ ++ return true; ++} ++ + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) + { +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index fd564209ac7..aa6cbe4a998 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -191,6 +191,8 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type); + struct vfio_info_cap_header * + vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id); ++bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, ++ unsigned int *avail); + #endif + extern const MemoryListener vfio_prereg_listener; + +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch b/kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch new file mode 100755 index 0000000..c515676 --- /dev/null +++ b/kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch @@ -0,0 +1,175 @@ +From 58edd0fba4d9e98edfeb16139467d6035a1f4e61 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:42 -0400 +Subject: [PATCH 08/12] vfio-ccw: Add support for the CRW region and IRQ + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-9-cohuck@redhat.com> +Patchwork-id: 97698 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 8/9] vfio-ccw: Add support for the CRW region and IRQ +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Farhan Ali + +The crw region can be used to obtain information about +Channel Report Words (CRW) from vfio-ccw driver. + +Currently only channel-path related CRWs are passed to +QEMU from vfio-ccw driver. 
+ +Signed-off-by: Farhan Ali +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-7-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit f030532f2ad6eeb200034915e9c6357cce81b538) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 73 insertions(+) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 94a0d9840d..b72a505893 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -44,7 +44,11 @@ struct VFIOCCWDevice { + uint64_t schib_region_size; + uint64_t schib_region_offset; + struct ccw_schib_region *schib_region; ++ uint64_t crw_region_size; ++ uint64_t crw_region_offset; ++ struct ccw_crw_region *crw_region; + EventNotifier io_notifier; ++ EventNotifier crw_notifier; + bool force_orb_pfch; + bool warned_orb_pfch; + }; +@@ -254,6 +258,44 @@ static void vfio_ccw_reset(DeviceState *dev) + ioctl(vcdev->vdev.fd, VFIO_DEVICE_RESET); + } + ++static void vfio_ccw_crw_read(VFIOCCWDevice *vcdev) ++{ ++ struct ccw_crw_region *region = vcdev->crw_region; ++ CRW crw; ++ int size; ++ ++ /* Keep reading CRWs as long as data is returned */ ++ do { ++ memset(region, 0, sizeof(*region)); ++ size = pread(vcdev->vdev.fd, region, vcdev->crw_region_size, ++ vcdev->crw_region_offset); ++ ++ if (size == -1) { ++ error_report("vfio-ccw: Read crw region failed with errno=%d", ++ errno); ++ break; ++ } ++ ++ if (region->crw == 0) { ++ /* No more CRWs to queue */ ++ break; ++ } ++ ++ memcpy(&crw, &region->crw, sizeof(CRW)); ++ ++ css_crw_add_to_queue(crw); ++ } while (1); ++} ++ ++static void vfio_ccw_crw_notifier_handler(void *opaque) ++{ ++ VFIOCCWDevice *vcdev = opaque; ++ ++ while (event_notifier_test_and_clear(&vcdev->crw_notifier)) { ++ vfio_ccw_crw_read(vcdev); ++ } ++} ++ + static void vfio_ccw_io_notifier_handler(void *opaque) + { + VFIOCCWDevice *vcdev = opaque; +@@ -340,6 +382,10 @@ static void 
vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, + notifier = &vcdev->io_notifier; + fd_read = vfio_ccw_io_notifier_handler; + break; ++ case VFIO_CCW_CRW_IRQ_INDEX: ++ notifier = &vcdev->crw_notifier; ++ fd_read = vfio_ccw_crw_notifier_handler; ++ break; + default: + error_setg(errp, "vfio: Unsupported device irq(%d)", irq); + return; +@@ -391,6 +437,9 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, + case VFIO_CCW_IO_IRQ_INDEX: + notifier = &vcdev->io_notifier; + break; ++ case VFIO_CCW_CRW_IRQ_INDEX: ++ notifier = &vcdev->crw_notifier; ++ break; + default: + error_report("vfio: Unsupported device irq(%d)", irq); + return; +@@ -468,10 +517,24 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) + vcdev->schib_region = g_malloc(info->size); + } + ++ ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, ++ VFIO_REGION_SUBTYPE_CCW_CRW, &info); ++ ++ if (!ret) { ++ vcdev->crw_region_size = info->size; ++ if (sizeof(*vcdev->crw_region) != vcdev->crw_region_size) { ++ error_setg(errp, "vfio: Unexpected size of the CRW region"); ++ goto out_err; ++ } ++ vcdev->crw_region_offset = info->offset; ++ vcdev->crw_region = g_malloc(info->size); ++ } ++ + g_free(info); + return; + + out_err: ++ g_free(vcdev->crw_region); + g_free(vcdev->schib_region); + g_free(vcdev->async_cmd_region); + g_free(vcdev->io_region); +@@ -481,6 +544,7 @@ out_err: + + static void vfio_ccw_put_region(VFIOCCWDevice *vcdev) + { ++ g_free(vcdev->crw_region); + g_free(vcdev->schib_region); + g_free(vcdev->async_cmd_region); + g_free(vcdev->io_region); +@@ -596,6 +660,14 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) + goto out_notifier_err; + } + ++ if (vcdev->crw_region) { ++ vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX, &err); ++ if (err) { ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); ++ goto out_notifier_err; ++ } ++ } ++ + return; + + out_notifier_err: +@@ -620,6 +692,7 @@ static void 
vfio_ccw_unrealize(DeviceState *dev, Error **errp) + S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); + VFIOGroup *group = vcdev->vdev.group; + ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX); + vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); + vfio_ccw_put_region(vcdev); + vfio_ccw_put_device(vcdev); +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Add-support-for-the-schib-region.patch b/kvm-vfio-ccw-Add-support-for-the-schib-region.patch new file mode 100755 index 0000000..667e5cf --- /dev/null +++ b/kvm-vfio-ccw-Add-support-for-the-schib-region.patch @@ -0,0 +1,254 @@ +From b73e3e52f76db823d7bffe3f705f575ca413863b Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:39 -0400 +Subject: [PATCH 05/12] vfio-ccw: Add support for the schib region + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-6-cohuck@redhat.com> +Patchwork-id: 97697 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 5/9] vfio-ccw: Add support for the schib region +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Farhan Ali + +The schib region can be used to obtain the latest SCHIB from the host +passthrough subchannel. Since the guest SCHIB is virtualized, +we currently only update the path related information so that the +guest is aware of any path related changes when it issues the +'stsch' instruction. + +Signed-off-by: Farhan Ali +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-4-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 46ea3841edaff2a7657b8f6c7f474e5e3850cd62) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/css.c | 13 ++++++-- + hw/s390x/s390-ccw.c | 21 +++++++++++++ + hw/vfio/ccw.c | 63 +++++++++++++++++++++++++++++++++++++ + include/hw/s390x/css.h | 3 +- + include/hw/s390x/s390-ccw.h | 1 + + target/s390x/ioinst.c | 3 +- + 6 files changed, 99 insertions(+), 5 deletions(-) + +diff --git a/hw/s390x/css.c b/hw/s390x/css.c +index 844caab408..71fd3f9a00 100644 +--- a/hw/s390x/css.c ++++ b/hw/s390x/css.c +@@ -1335,11 +1335,20 @@ static void copy_schib_to_guest(SCHIB *dest, const SCHIB *src) + } + } + +-int css_do_stsch(SubchDev *sch, SCHIB *schib) ++IOInstEnding css_do_stsch(SubchDev *sch, SCHIB *schib) + { ++ int ret; ++ ++ /* ++ * For some subchannels, we may want to update parts of ++ * the schib (e.g., update path masks from the host device ++ * for passthrough subchannels). ++ */ ++ ret = s390_ccw_store(sch); ++ + /* Use current status. */ + copy_schib_to_guest(schib, &sch->curr_status); +- return 0; ++ return ret; + } + + static void copy_pmcw_from_guest(PMCW *dest, const PMCW *src) +diff --git a/hw/s390x/s390-ccw.c b/hw/s390x/s390-ccw.c +index 0c5a5b60bd..75b788c95e 100644 +--- a/hw/s390x/s390-ccw.c ++++ b/hw/s390x/s390-ccw.c +@@ -51,6 +51,27 @@ int s390_ccw_clear(SubchDev *sch) + return cdc->handle_clear(sch); + } + ++IOInstEnding s390_ccw_store(SubchDev *sch) ++{ ++ S390CCWDeviceClass *cdc = NULL; ++ int ret = IOINST_CC_EXPECTED; ++ ++ /* ++ * This code is called for both virtual and passthrough devices, ++ * but only applies to to the latter. This ugly check makes that ++ * distinction for us. 
++ */ ++ if (object_dynamic_cast(OBJECT(sch->driver_data), TYPE_S390_CCW)) { ++ cdc = S390_CCW_DEVICE_GET_CLASS(sch->driver_data); ++ } ++ ++ if (cdc && cdc->handle_store) { ++ ret = cdc->handle_store(sch); ++ } ++ ++ return ret; ++} ++ + static void s390_ccw_get_dev_info(S390CCWDevice *cdev, + char *sysfsdev, + Error **errp) +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 17eb4c4048..859ad646f1 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -41,6 +41,9 @@ struct VFIOCCWDevice { + uint64_t async_cmd_region_size; + uint64_t async_cmd_region_offset; + struct ccw_cmd_region *async_cmd_region; ++ uint64_t schib_region_size; ++ uint64_t schib_region_offset; ++ struct ccw_schib_region *schib_region; + EventNotifier io_notifier; + bool force_orb_pfch; + bool warned_orb_pfch; +@@ -116,6 +119,51 @@ again: + } + } + ++static IOInstEnding vfio_ccw_handle_store(SubchDev *sch) ++{ ++ S390CCWDevice *cdev = sch->driver_data; ++ VFIOCCWDevice *vcdev = DO_UPCAST(VFIOCCWDevice, cdev, cdev); ++ SCHIB *schib = &sch->curr_status; ++ struct ccw_schib_region *region = vcdev->schib_region; ++ SCHIB *s; ++ int ret; ++ ++ /* schib region not available so nothing else to do */ ++ if (!region) { ++ return IOINST_CC_EXPECTED; ++ } ++ ++ memset(region, 0, sizeof(*region)); ++ ret = pread(vcdev->vdev.fd, region, vcdev->schib_region_size, ++ vcdev->schib_region_offset); ++ ++ if (ret == -1) { ++ /* ++ * Device is probably damaged, but store subchannel does not ++ * have a nonzero cc defined for this scenario. Log an error, ++ * and presume things are otherwise fine. ++ */ ++ error_report("vfio-ccw: store region read failed with errno=%d", errno); ++ return IOINST_CC_EXPECTED; ++ } ++ ++ /* ++ * Selectively copy path-related bits of the SCHIB, ++ * rather than copying the entire struct. 
++ */ ++ s = (SCHIB *)region->schib_area; ++ schib->pmcw.pnom = s->pmcw.pnom; ++ schib->pmcw.lpum = s->pmcw.lpum; ++ schib->pmcw.pam = s->pmcw.pam; ++ schib->pmcw.pom = s->pmcw.pom; ++ ++ if (s->scsw.flags & SCSW_FLAGS_MASK_PNO) { ++ schib->scsw.flags |= SCSW_FLAGS_MASK_PNO; ++ } ++ ++ return IOINST_CC_EXPECTED; ++} ++ + static int vfio_ccw_handle_clear(SubchDev *sch) + { + S390CCWDevice *cdev = sch->driver_data; +@@ -382,10 +430,23 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) + vcdev->async_cmd_region = g_malloc0(info->size); + } + ++ ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, ++ VFIO_REGION_SUBTYPE_CCW_SCHIB, &info); ++ if (!ret) { ++ vcdev->schib_region_size = info->size; ++ if (sizeof(*vcdev->schib_region) != vcdev->schib_region_size) { ++ error_setg(errp, "vfio: Unexpected size of the schib region"); ++ goto out_err; ++ } ++ vcdev->schib_region_offset = info->offset; ++ vcdev->schib_region = g_malloc(info->size); ++ } ++ + g_free(info); + return; + + out_err: ++ g_free(vcdev->schib_region); + g_free(vcdev->async_cmd_region); + g_free(vcdev->io_region); + g_free(info); +@@ -394,6 +455,7 @@ out_err: + + static void vfio_ccw_put_region(VFIOCCWDevice *vcdev) + { ++ g_free(vcdev->schib_region); + g_free(vcdev->async_cmd_region); + g_free(vcdev->io_region); + } +@@ -569,6 +631,7 @@ static void vfio_ccw_class_init(ObjectClass *klass, void *data) + cdc->handle_request = vfio_ccw_handle_request; + cdc->handle_halt = vfio_ccw_handle_halt; + cdc->handle_clear = vfio_ccw_handle_clear; ++ cdc->handle_store = vfio_ccw_handle_store; + } + + static const TypeInfo vfio_ccw_info = { +diff --git a/include/hw/s390x/css.h b/include/hw/s390x/css.h +index f46bcafb16..7e3a5e7433 100644 +--- a/include/hw/s390x/css.h ++++ b/include/hw/s390x/css.h +@@ -218,6 +218,7 @@ IOInstEnding do_subchannel_work_passthrough(SubchDev *sub); + + int s390_ccw_halt(SubchDev *sch); + int s390_ccw_clear(SubchDev *sch); ++IOInstEnding s390_ccw_store(SubchDev *sch); 
+ + typedef enum { + CSS_IO_ADAPTER_VIRTIO = 0, +@@ -242,7 +243,7 @@ SubchDev *css_find_subch(uint8_t m, uint8_t cssid, uint8_t ssid, + uint16_t schid); + bool css_subch_visible(SubchDev *sch); + void css_conditional_io_interrupt(SubchDev *sch); +-int css_do_stsch(SubchDev *sch, SCHIB *schib); ++IOInstEnding css_do_stsch(SubchDev *sch, SCHIB *schib); + bool css_schid_final(int m, uint8_t cssid, uint8_t ssid, uint16_t schid); + IOInstEnding css_do_msch(SubchDev *sch, const SCHIB *schib); + IOInstEnding css_do_xsch(SubchDev *sch); +diff --git a/include/hw/s390x/s390-ccw.h b/include/hw/s390x/s390-ccw.h +index fffb54562f..4a43803ef2 100644 +--- a/include/hw/s390x/s390-ccw.h ++++ b/include/hw/s390x/s390-ccw.h +@@ -37,6 +37,7 @@ typedef struct S390CCWDeviceClass { + IOInstEnding (*handle_request) (SubchDev *sch); + int (*handle_halt) (SubchDev *sch); + int (*handle_clear) (SubchDev *sch); ++ IOInstEnding (*handle_store) (SubchDev *sch); + } S390CCWDeviceClass; + + #endif +diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c +index f40c35c6ff..b6be300cc4 100644 +--- a/target/s390x/ioinst.c ++++ b/target/s390x/ioinst.c +@@ -292,8 +292,7 @@ void ioinst_handle_stsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, + sch = css_find_subch(m, cssid, ssid, schid); + if (sch) { + if (css_subch_visible(sch)) { +- css_do_stsch(sch, &schib); +- cc = 0; ++ cc = css_do_stsch(sch, &schib); + } else { + /* Indicate no more subchannels in this css/ss */ + cc = 3; +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Connect-the-device-request-notifier.patch b/kvm-vfio-ccw-Connect-the-device-request-notifier.patch new file mode 100755 index 0000000..298fb29 --- /dev/null +++ b/kvm-vfio-ccw-Connect-the-device-request-notifier.patch @@ -0,0 +1,128 @@ +From db6a782f8b9ba062f195ff504b4d2f93e471fecc Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 11 May 2021 11:24:05 -0400 +Subject: [PATCH 2/5] vfio-ccw: Connect the device request notifier + +RH-Author: Thomas Huth +Message-id: 
<20210511112405.297037-3-thuth@redhat.com> +Patchwork-id: 101536 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] vfio-ccw: Connect the device request notifier +Bugzilla: 1940450 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +Now that the vfio-ccw code has a notifier interface to request that +a device be unplugged, let's wire that together. + +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20210104202057.48048-4-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit b2f96f9e4f5fbc8f2770a436191cb328da4d5350) +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1940450 +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 40 ++++++++++++++++++++++++++++++++++++---- + 1 file changed, 36 insertions(+), 4 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index b72a505893..3d450fe1c9 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -49,6 +49,7 @@ struct VFIOCCWDevice { + struct ccw_crw_region *crw_region; + EventNotifier io_notifier; + EventNotifier crw_notifier; ++ EventNotifier req_notifier; + bool force_orb_pfch; + bool warned_orb_pfch; + }; +@@ -287,6 +288,21 @@ static void vfio_ccw_crw_read(VFIOCCWDevice *vcdev) + } while (1); + } + ++static void vfio_ccw_req_notifier_handler(void *opaque) ++{ ++ VFIOCCWDevice *vcdev = opaque; ++ Error *err = NULL; ++ ++ if (!event_notifier_test_and_clear(&vcdev->req_notifier)) { ++ return; ++ } ++ ++ qdev_unplug(DEVICE(vcdev), &err); ++ if (err) { ++ warn_reportf_err(err, VFIO_MSG_PREFIX, vcdev->vdev.name); ++ } ++} ++ + static void vfio_ccw_crw_notifier_handler(void *opaque) + { + VFIOCCWDevice *vcdev = opaque; +@@ -386,6 +402,10 @@ static void vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, + notifier = &vcdev->crw_notifier; + fd_read = vfio_ccw_crw_notifier_handler; + break; ++ case VFIO_CCW_REQ_IRQ_INDEX: ++ notifier = &vcdev->req_notifier; ++ fd_read = vfio_ccw_req_notifier_handler; ++ 
break; + default: + error_setg(errp, "vfio: Unsupported device irq(%d)", irq); + return; +@@ -440,6 +460,9 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, + case VFIO_CCW_CRW_IRQ_INDEX: + notifier = &vcdev->crw_notifier; + break; ++ case VFIO_CCW_REQ_IRQ_INDEX: ++ notifier = &vcdev->req_notifier; ++ break; + default: + error_report("vfio: Unsupported device irq(%d)", irq); + return; +@@ -657,20 +680,28 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) + + vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX, &err); + if (err) { +- goto out_notifier_err; ++ goto out_io_notifier_err; + } + + if (vcdev->crw_region) { + vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX, &err); + if (err) { +- vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); +- goto out_notifier_err; ++ goto out_crw_notifier_err; + } + } + ++ vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_REQ_IRQ_INDEX, &err); ++ if (err) { ++ goto out_req_notifier_err; ++ } ++ + return; + +-out_notifier_err: ++out_req_notifier_err: ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX); ++out_crw_notifier_err: ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); ++out_io_notifier_err: + vfio_ccw_put_region(vcdev); + out_region_err: + vfio_ccw_put_device(vcdev); +@@ -692,6 +723,7 @@ static void vfio_ccw_unrealize(DeviceState *dev, Error **errp) + S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); + VFIOGroup *group = vcdev->vdev.group; + ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_REQ_IRQ_INDEX); + vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX); + vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); + vfio_ccw_put_region(vcdev); +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Fix-error-message.patch b/kvm-vfio-ccw-Fix-error-message.patch new file mode 100755 index 0000000..86d2fdf --- /dev/null +++ b/kvm-vfio-ccw-Fix-error-message.patch @@ -0,0 +1,48 @@ +From 
7258b1fabcd152c2ad9b61485b869a41d1bc64e2 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:35 -0400 +Subject: [PATCH 01/12] vfio-ccw: Fix error message +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-2-cohuck@redhat.com> +Patchwork-id: 97693 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/9] vfio-ccw: Fix error message +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Boris Fiuczynski + +Signed-off-by: Boris Fiuczynski +Reviewed-by: Eric Farman +Message-Id: <20191128143015.5231-1-fiuczy@linux.ibm.com> +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Cornelia Huck +(cherry picked from commit 91f751dc111b270b1e81d80ac92cf479e7620fa4) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 6863f6c69f..3b5520ae75 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -102,7 +102,7 @@ again: + if (errno == EAGAIN) { + goto again; + } +- error_report("vfio-ccw: wirte I/O region failed with errno=%d", errno); ++ error_report("vfio-ccw: write I/O region failed with errno=%d", errno); + ret = -errno; + } else { + ret = region->ret_code; +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Refactor-ccw-irq-handler.patch b/kvm-vfio-ccw-Refactor-ccw-irq-handler.patch new file mode 100755 index 0000000..8a3514d --- /dev/null +++ b/kvm-vfio-ccw-Refactor-ccw-irq-handler.patch @@ -0,0 +1,155 @@ +From ee9b03e774641fba8baaf85256706fcc5e8d8efa Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:40 -0400 +Subject: [PATCH 06/12] vfio-ccw: Refactor ccw irq handler + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-7-cohuck@redhat.com> +Patchwork-id: 97695 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 
6/9] vfio-ccw: Refactor ccw irq handler +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Eric Farman + +Make it easier to add new ones in the future. + +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-5-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 690e29b91102ac69810b35fe72cd90bc9fa1fff7) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 58 +++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 42 insertions(+), 16 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 859ad646f1..94a0d9840d 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -324,22 +324,36 @@ read_err: + css_inject_io_interrupt(sch); + } + +-static void vfio_ccw_register_io_notifier(VFIOCCWDevice *vcdev, Error **errp) ++static void vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, ++ unsigned int irq, ++ Error **errp) + { + VFIODevice *vdev = &vcdev->vdev; + struct vfio_irq_info *irq_info; + size_t argsz; + int fd; ++ EventNotifier *notifier; ++ IOHandler *fd_read; ++ ++ switch (irq) { ++ case VFIO_CCW_IO_IRQ_INDEX: ++ notifier = &vcdev->io_notifier; ++ fd_read = vfio_ccw_io_notifier_handler; ++ break; ++ default: ++ error_setg(errp, "vfio: Unsupported device irq(%d)", irq); ++ return; ++ } + +- if (vdev->num_irqs < VFIO_CCW_IO_IRQ_INDEX + 1) { +- error_setg(errp, "vfio: unexpected number of io irqs %u", ++ if (vdev->num_irqs < irq + 1) { ++ error_setg(errp, "vfio: unexpected number of irqs %u", + vdev->num_irqs); + return; + } + + argsz = sizeof(*irq_info); + irq_info = g_malloc0(argsz); +- irq_info->index = VFIO_CCW_IO_IRQ_INDEX; ++ irq_info->index = irq; + irq_info->argsz = argsz; + if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, + irq_info) < 0 || irq_info->count < 1) { +@@ -347,37 +361,49 @@ static void vfio_ccw_register_io_notifier(VFIOCCWDevice *vcdev, Error **errp) + goto 
out_free_info; + } + +- if (event_notifier_init(&vcdev->io_notifier, 0)) { ++ if (event_notifier_init(notifier, 0)) { + error_setg_errno(errp, errno, +- "vfio: Unable to init event notifier for IO"); ++ "vfio: Unable to init event notifier for irq (%d)", ++ irq); + goto out_free_info; + } + +- fd = event_notifier_get_fd(&vcdev->io_notifier); +- qemu_set_fd_handler(fd, vfio_ccw_io_notifier_handler, NULL, vcdev); ++ fd = event_notifier_get_fd(notifier); ++ qemu_set_fd_handler(fd, fd_read, NULL, vcdev); + +- if (vfio_set_irq_signaling(vdev, VFIO_CCW_IO_IRQ_INDEX, 0, ++ if (vfio_set_irq_signaling(vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { + qemu_set_fd_handler(fd, NULL, NULL, vcdev); +- event_notifier_cleanup(&vcdev->io_notifier); ++ event_notifier_cleanup(notifier); + } + + out_free_info: + g_free(irq_info); + } + +-static void vfio_ccw_unregister_io_notifier(VFIOCCWDevice *vcdev) ++static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, ++ unsigned int irq) + { + Error *err = NULL; ++ EventNotifier *notifier; ++ ++ switch (irq) { ++ case VFIO_CCW_IO_IRQ_INDEX: ++ notifier = &vcdev->io_notifier; ++ break; ++ default: ++ error_report("vfio: Unsupported device irq(%d)", irq); ++ return; ++ } + +- if (vfio_set_irq_signaling(&vcdev->vdev, VFIO_CCW_IO_IRQ_INDEX, 0, ++ if (vfio_set_irq_signaling(&vcdev->vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vcdev->vdev.name); + } + +- qemu_set_fd_handler(event_notifier_get_fd(&vcdev->io_notifier), ++ qemu_set_fd_handler(event_notifier_get_fd(notifier), + NULL, NULL, vcdev); +- event_notifier_cleanup(&vcdev->io_notifier); ++ event_notifier_cleanup(notifier); + } + + static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) +@@ -565,7 +591,7 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) + goto out_region_err; + } + +- vfio_ccw_register_io_notifier(vcdev, &err); ++ vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX, 
&err); + if (err) { + goto out_notifier_err; + } +@@ -594,7 +620,7 @@ static void vfio_ccw_unrealize(DeviceState *dev, Error **errp) + S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); + VFIOGroup *group = vcdev->vdev.group; + +- vfio_ccw_unregister_io_notifier(vcdev); ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); + vfio_ccw_put_region(vcdev); + vfio_ccw_put_device(vcdev); + vfio_put_group(group); +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Refactor-cleanup-of-regions.patch b/kvm-vfio-ccw-Refactor-cleanup-of-regions.patch new file mode 100755 index 0000000..1741f4b --- /dev/null +++ b/kvm-vfio-ccw-Refactor-cleanup-of-regions.patch @@ -0,0 +1,73 @@ +From 30906c9c78af2710a2b86c096cc7b18bbc4b4e69 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:38 -0400 +Subject: [PATCH 04/12] vfio-ccw: Refactor cleanup of regions + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-5-cohuck@redhat.com> +Patchwork-id: 97694 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 4/9] vfio-ccw: Refactor cleanup of regions +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Eric Farman + +While we're at it, add a g_free() for the async_cmd_region that +is the last thing currently created. g_free() knows how to handle +NULL pointers, so this makes it easier to remember what cleanups +need to be performed when new regions are added. + +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-3-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 2a3b9cbaa7b25a4db4cdcfe1c65279c5464f2923) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/vfio/ccw.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 6bc612b5b7..17eb4c4048 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -363,8 +363,7 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) + vcdev->io_region_size = info->size; + if (sizeof(*vcdev->io_region) != vcdev->io_region_size) { + error_setg(errp, "vfio: Unexpected size of the I/O region"); +- g_free(info); +- return; ++ goto out_err; + } + + vcdev->io_region_offset = info->offset; +@@ -377,15 +376,20 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) + vcdev->async_cmd_region_size = info->size; + if (sizeof(*vcdev->async_cmd_region) != vcdev->async_cmd_region_size) { + error_setg(errp, "vfio: Unexpected size of the async cmd region"); +- g_free(vcdev->io_region); +- g_free(info); +- return; ++ goto out_err; + } + vcdev->async_cmd_region_offset = info->offset; + vcdev->async_cmd_region = g_malloc0(info->size); + } + + g_free(info); ++ return; ++ ++out_err: ++ g_free(vcdev->async_cmd_region); ++ g_free(vcdev->io_region); ++ g_free(info); ++ return; + } + + static void vfio_ccw_put_region(VFIOCCWDevice *vcdev) +-- +2.27.0 + diff --git a/kvm-vfio-ccw-allow-non-prefetch-ORBs.patch b/kvm-vfio-ccw-allow-non-prefetch-ORBs.patch new file mode 100755 index 0000000..da2fc5c --- /dev/null +++ b/kvm-vfio-ccw-allow-non-prefetch-ORBs.patch @@ -0,0 +1,61 @@ +From d5f5a307f3396064d29ef0d300c7377756dd165b Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:36 -0400 +Subject: [PATCH 02/12] vfio-ccw: allow non-prefetch ORBs + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-3-cohuck@redhat.com> +Patchwork-id: 97692 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/9] vfio-ccw: allow non-prefetch ORBs +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Jared Rossi + +Remove the explicit 
prefetch check when using vfio-ccw devices. +This check does not trigger in practice as all Linux channel programs +are intended to use prefetch. + +Newer Linux kernel versions do not require to force the PFCH flag with +vfio-ccw devices anymore. + +Signed-off-by: Jared Rossi +Reviewed-by: Eric Farman +Message-Id: <20200512181535.18630-2-jrossi@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 24e58a7b1d411627e326144030a20dcf0093fed0) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 13 +++---------- + 1 file changed, 3 insertions(+), 10 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 3b5520ae75..6bc612b5b7 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -74,16 +74,9 @@ static IOInstEnding vfio_ccw_handle_request(SubchDev *sch) + struct ccw_io_region *region = vcdev->io_region; + int ret; + +- if (!(sch->orb.ctrl0 & ORB_CTRL0_MASK_PFCH)) { +- if (!(vcdev->force_orb_pfch)) { +- warn_once_pfch(vcdev, sch, "requires PFCH flag set"); +- sch_gen_unit_exception(sch); +- css_inject_io_interrupt(sch); +- return IOINST_CC_EXPECTED; +- } else { +- sch->orb.ctrl0 |= ORB_CTRL0_MASK_PFCH; +- warn_once_pfch(vcdev, sch, "PFCH flag forced"); +- } ++ if (!(sch->orb.ctrl0 & ORB_CTRL0_MASK_PFCH) && vcdev->force_orb_pfch) { ++ sch->orb.ctrl0 |= ORB_CTRL0_MASK_PFCH; ++ warn_once_pfch(vcdev, sch, "PFCH flag forced"); + } + + QEMU_BUILD_BUG_ON(sizeof(region->orb_area) != sizeof(ORB)); +-- +2.27.0 + diff --git a/kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch b/kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch new file mode 100755 index 0000000..81cf80e --- /dev/null +++ b/kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch @@ -0,0 +1,75 @@ +From f01098bb86c12f485895f38f7a24170ec84b60b6 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Mon, 8 Jun 2020 16:25:21 -0400 +Subject: [PATCH 42/42] vfio/nvlink: Remove exec permission to avoid SELinux + AVCs +MIME-Version: 
1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Greg Kurz +Message-id: <20200608162521.382858-2-gkurz@redhat.com> +Patchwork-id: 97459 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] vfio/nvlink: Remove exec permission to avoid SELinux AVCs +Bugzilla: 1823275 +RH-Acked-by: David Gibson +RH-Acked-by: Laurent Vivier +RH-Acked-by: Philippe Mathieu-Daudé + +From: Leonardo Bras + +If SELinux is setup without 'execmem' permission for qemu, all mmap +with (PROT_WRITE | PROT_EXEC) will fail and print a warning in +SELinux log. + +If "nvlink2-mr" memory allocation fails (first diff), it will cause +guest NUMA nodes to not be correctly configured (V100 memory will +not be visible for guest, nor its NUMA nodes). + +Not having 'execmem' permission is interesting for virtual machines to +avoid buffer-overflow based attacks, and it's adopted in distros +like RHEL. + +So, removing the PROT_EXEC flag seems the right thing to do. + +Browsing some other code that mmaps memory for usage with +memory_region_init_ram_device_ptr, I could notice it's usual to +not have PROT_EXEC (only PROT_READ | PROT_WRITE), so it should be +no problem around this. + +Signed-off-by: Leonardo Bras +Message-Id: <20200501055448.286518-1-leobras.c@gmail.com> +Acked-by: Alex Williamson +Signed-off-by: David Gibson +(cherry picked from commit 9c7c0407028355ca83349b8a60fddfad46f2ebd8) +Signed-off-by: Greg Kurz +Signed-off-by: Danilo C. L.
de Paula +--- + hw/vfio/pci-quirks.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c +index 4505ffe48a..1c5fe014cf 100644 +--- a/hw/vfio/pci-quirks.c ++++ b/hw/vfio/pci-quirks.c +@@ -2237,7 +2237,7 @@ int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp) + } + cap = (void *) hdr; + +- p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC, ++ p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE, + MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset); + if (p == MAP_FAILED) { + ret = -errno; +@@ -2297,7 +2297,7 @@ int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp) + + /* Some NVLink bridges may not have assigned ATSD */ + if (atsdreg->size) { +- p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC, ++ p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE, + MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset); + if (p == MAP_FAILED) { + ret = -errno; +-- +2.27.0 + diff --git a/kvm-vfio-pci-Don-t-remove-irqchip-notifier-if-not-regist.patch b/kvm-vfio-pci-Don-t-remove-irqchip-notifier-if-not-regist.patch new file mode 100755 index 0000000..d416e0f --- /dev/null +++ b/kvm-vfio-pci-Don-t-remove-irqchip-notifier-if-not-regist.patch @@ -0,0 +1,58 @@ +From e4631c00d8e9ee3608ef3196cbe8bec4841ee988 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 8 Jan 2020 15:04:57 +0000 +Subject: [PATCH 2/5] vfio/pci: Don't remove irqchip notifier if not registered + +RH-Author: Peter Xu +Message-id: <20200108150457.12324-2-peterx@redhat.com> +Patchwork-id: 93291 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] vfio/pci: Don't remove irqchip notifier if not registered +Bugzilla: 1782678 +RH-Acked-by: Alex Williamson +RH-Acked-by: Cornelia Huck +RH-Acked-by: Auger Eric +RH-Acked-by: Jens Freimann + +The kvm irqchip notifier is only registered if the device supports +INTx, however it's unconditionally removed. 
If the assigned device +does not support INTx, this will cause QEMU to crash when unplugging +the device from the system. Change it to conditionally remove the +notifier only if the notify hook is setup. + +CC: Eduardo Habkost +CC: David Gibson +CC: Alex Williamson +Cc: qemu-stable@nongnu.org # v4.2 +Reported-by: yanghliu@redhat.com +Debugged-by: Eduardo Habkost +Fixes: c5478fea27ac ("vfio/pci: Respond to KVM irqchip change notifier") +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1782678 +Signed-off-by: Peter Xu +Reviewed-by: David Gibson +Reviewed-by: Greg Kurz +Signed-off-by: Alex Williamson +(cherry picked from commit 0446f8121723b134ca1d1ed0b73e96d4a0a8689d) +Signed-off-by: Peter Xu +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/pci.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 309535f..d717520 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3100,7 +3100,9 @@ static void vfio_exitfn(PCIDevice *pdev) + vfio_unregister_req_notifier(vdev); + vfio_unregister_err_notifier(vdev); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); +- kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); ++ if (vdev->irqchip_change_notifier.notify) { ++ kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); ++ } + vfio_disable_interrupts(vdev); + if (vdev->intx.mmap_timer) { + timer_free(vdev->intx.mmap_timer); +-- +1.8.3.1 + diff --git a/kvm-vhost-Add-names-to-section-rounded-warning.patch b/kvm-vhost-Add-names-to-section-rounded-warning.patch new file mode 100755 index 0000000..c41a14c --- /dev/null +++ b/kvm-vhost-Add-names-to-section-rounded-warning.patch @@ -0,0 +1,53 @@ +From 0d545c5850caf76ad3e8dd9bb0fbc9f86b08e220 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Fri, 24 Jan 2020 19:46:11 +0100 +Subject: [PATCH 002/116] vhost: Add names to section rounded warning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200124194613.41119-2-dgilbert@redhat.com> +Patchwork-id: 93450 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 1/3] vhost: Add names to section rounded warning +Bugzilla: 1779041 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Philippe Mathieu-Daudé + +From: "Dr. David Alan Gilbert" + +Add the memory region names to section rounding/alignment +warnings. + +Signed-off-by: Dr. David Alan Gilbert +Message-Id: <20200116202414.157959-2-dgilbert@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit ff4776147e960b128ee68f94c728659f662f4378) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 4da0d5a..774d87d 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -590,9 +590,10 @@ static void vhost_region_add_section(struct vhost_dev *dev, + * match up in the same RAMBlock if they do. 
+ */ + if (mrs_gpa < prev_gpa_start) { +- error_report("%s:Section rounded to %"PRIx64 +- " prior to previous %"PRIx64, +- __func__, mrs_gpa, prev_gpa_start); ++ error_report("%s:Section '%s' rounded to %"PRIx64 ++ " prior to previous '%s' %"PRIx64, ++ __func__, section->mr->name, mrs_gpa, ++ prev_sec->mr->name, prev_gpa_start); + /* A way to cleanly fail here would be better */ + return; + } +-- +1.8.3.1 + diff --git a/kvm-vhost-Only-align-sections-for-vhost-user.patch b/kvm-vhost-Only-align-sections-for-vhost-user.patch new file mode 100755 index 0000000..e082ce8 --- /dev/null +++ b/kvm-vhost-Only-align-sections-for-vhost-user.patch @@ -0,0 +1,97 @@ +From c35466c168e5219bf585aa65ac31fc9bdc7cbf36 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Fri, 24 Jan 2020 19:46:12 +0100 +Subject: [PATCH 003/116] vhost: Only align sections for vhost-user +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200124194613.41119-3-dgilbert@redhat.com> +Patchwork-id: 93452 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 2/3] vhost: Only align sections for vhost-user +Bugzilla: 1779041 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Philippe Mathieu-Daudé + +From: "Dr. David Alan Gilbert" + +I added hugepage alignment code in c1ece84e7c9 to deal with +vhost-user + postcopy which needs aligned pages when using userfault. +However, on x86 the lower 2MB of address space tends to be shotgun'd +with small fragments around the 512-640k range - e.g. video RAM, and +with HyperV synic pages tend to sit around there - again splitting +it up. The alignment code complains with a 'Section rounded to ...' +error and gives up. + +Since vhost-user already filters out devices without an fd +(see vhost-user.c vhost_user_mem_section_filter) it shouldn't be +affected by those overlaps. 
+ +Turn the alignment off on vhost-kernel so that it doesn't try +and align, and thus won't hit the rounding issues. + +Signed-off-by: Dr. David Alan Gilbert +Message-Id: <20200116202414.157959-3-dgilbert@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Paolo Bonzini +(cherry picked from commit 76525114736e8f669766e69b715fa59ce8648aae) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost.c | 34 ++++++++++++++++++---------------- + 1 file changed, 18 insertions(+), 16 deletions(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 774d87d..25fd469 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -547,26 +547,28 @@ static void vhost_region_add_section(struct vhost_dev *dev, + uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + RAMBlock *mrs_rb = section->mr->ram_block; +- size_t mrs_page = qemu_ram_pagesize(mrs_rb); + + trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, + mrs_host); + +- /* Round the section to it's page size */ +- /* First align the start down to a page boundary */ +- uint64_t alignage = mrs_host & (mrs_page - 1); +- if (alignage) { +- mrs_host -= alignage; +- mrs_size += alignage; +- mrs_gpa -= alignage; +- } +- /* Now align the size up to a page boundary */ +- alignage = mrs_size & (mrs_page - 1); +- if (alignage) { +- mrs_size += mrs_page - alignage; +- } +- trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size, +- mrs_host); ++ if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { ++ /* Round the section to it's page size */ ++ /* First align the start down to a page boundary */ ++ size_t mrs_page = qemu_ram_pagesize(mrs_rb); ++ uint64_t alignage = mrs_host & (mrs_page - 1); ++ if (alignage) { ++ mrs_host -= alignage; ++ mrs_size += alignage; ++ mrs_gpa -= alignage; ++ } ++ /* Now align the size up to a page boundary */ ++ alignage = mrs_size & (mrs_page - 1); ++ if 
(alignage) { ++ mrs_size += mrs_page - alignage; ++ } ++ trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size, ++ mrs_host); ++ } + + if (dev->n_tmp_sections) { + /* Since we already have at least one section, lets see if +-- +1.8.3.1 + diff --git a/kvm-vhost-coding-style-fix.patch b/kvm-vhost-coding-style-fix.patch new file mode 100755 index 0000000..4546130 --- /dev/null +++ b/kvm-vhost-coding-style-fix.patch @@ -0,0 +1,56 @@ +From 624d96c456536e1471968a59fbeea206309cc33b Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Fri, 24 Jan 2020 19:46:13 +0100 +Subject: [PATCH 004/116] vhost: coding style fix +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200124194613.41119-4-dgilbert@redhat.com> +Patchwork-id: 93453 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 3/3] vhost: coding style fix +Bugzilla: 1779041 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Philippe Mathieu-Daudé + +From: "Michael S. Tsirkin" + +Drop a trailing whitespace. Make line shorter. + +Fixes: 76525114736e8 ("vhost: Only align sections for vhost-user") +Signed-off-by: Michael S. 
Tsirkin +(cherry picked from commit 8347505640238d3b80f9bb7510fdc1bb574bad19) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 25fd469..9edfadc 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -551,7 +551,7 @@ static void vhost_region_add_section(struct vhost_dev *dev, + trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, + mrs_host); + +- if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { ++ if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { + /* Round the section to it's page size */ + /* First align the start down to a page boundary */ + size_t mrs_page = qemu_ram_pagesize(mrs_rb); +@@ -566,8 +566,8 @@ static void vhost_region_add_section(struct vhost_dev *dev, + if (alignage) { + mrs_size += mrs_page - alignage; + } +- trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size, +- mrs_host); ++ trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, ++ mrs_size, mrs_host); + } + + if (dev->n_tmp_sections) { +-- +1.8.3.1 + diff --git a/kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch b/kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch new file mode 100755 index 0000000..7e1353c --- /dev/null +++ b/kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch @@ -0,0 +1,69 @@ +From e06655cfe0fa9473b1e8b311571f36d787472834 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:02 -0400 +Subject: [PATCH 20/42] vhost: correctly turn on VIRTIO_F_IOMMU_PLATFORM + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-21-thuth@redhat.com> +Patchwork-id: 97041 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 20/38] vhost: correctly turn on VIRTIO_F_IOMMU_PLATFORM +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Jason Wang + +We turn on device IOTLB via 
VIRTIO_F_IOMMU_PLATFORM unconditionally on +platform without IOMMU support. This can lead to unnecessary IOTLB +transactions which will damage the performance. + +Fix this by checking whether the device is backed by an IOMMU and +disabling device IOTLB when it is not. + +Reported-by: Halil Pasic +Tested-by: Halil Pasic +Reviewed-by: Halil Pasic +Signed-off-by: Jason Wang +Message-Id: <20200302042454.24814-1-jasowang@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit f7ef7e6e3ba6e994e070cc609eb154339d1c4a11) +Signed-off-by: Danilo C. L. de Paula +--- + hw/virtio/vhost.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 9edfadc81d..9182a00495 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -290,7 +290,14 @@ static int vhost_dev_has_iommu(struct vhost_dev *dev) + { + VirtIODevice *vdev = dev->vdev; + +- return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); ++ /* ++ * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support ++ * incremental memory mapping API via IOTLB API. For platform that ++ * does not have IOMMU, there's no need to enable this feature ++ * which may cause unnecessary IOTLB miss/update trnasactions.
++ */ ++ return vdev->dma_as != &address_space_memory && ++ virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + } + + static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr, +@@ -765,6 +772,9 @@ static int vhost_dev_set_features(struct vhost_dev *dev, + if (enable_log) { + features |= 0x1ULL << VHOST_F_LOG_ALL; + } ++ if (!vhost_dev_has_iommu(dev)) { ++ features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); ++ } + r = dev->vhost_ops->vhost_set_features(dev, features); + if (r < 0) { + VHOST_OPS_DEBUG("vhost_set_features failed"); +-- +2.27.0 + diff --git a/kvm-vhost-user-Print-unexpected-slave-message-types.patch b/kvm-vhost-user-Print-unexpected-slave-message-types.patch new file mode 100755 index 0000000..e5776e7 --- /dev/null +++ b/kvm-vhost-user-Print-unexpected-slave-message-types.patch @@ -0,0 +1,48 @@ +From d6abbdaeb2c35efe6793a599c98116e250b1f179 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:43 +0100 +Subject: [PATCH 072/116] vhost-user: Print unexpected slave message types +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-69-dgilbert@redhat.com> +Patchwork-id: 93519 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 068/112] vhost-user: Print unexpected slave message types +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +When we receive an unexpected message type on the slave fd, print +the type. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 0fdc465d7d5aafeae127eba488f247ac6f58df4c) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost-user.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c +index 02a9b25..e4f46ec 100644 +--- a/hw/virtio/vhost-user.c ++++ b/hw/virtio/vhost-user.c +@@ -1055,7 +1055,7 @@ static void slave_read(void *opaque) + fd[0]); + break; + default: +- error_report("Received unexpected msg type."); ++ error_report("Received unexpected msg type: %d.", hdr.request); + ret = -EINVAL; + } + +-- +1.8.3.1 + diff --git a/kvm-vhost-user-fs-remove-vhostfd-property.patch b/kvm-vhost-user-fs-remove-vhostfd-property.patch new file mode 100755 index 0000000..5904e82 --- /dev/null +++ b/kvm-vhost-user-fs-remove-vhostfd-property.patch @@ -0,0 +1,59 @@ +From 912af6f7c270e2939a91c9b3f62b6ba1202edc43 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:37 +0100 +Subject: [PATCH 006/116] vhost-user-fs: remove "vhostfd" property +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-3-dgilbert@redhat.com> +Patchwork-id: 93458 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 002/112] vhost-user-fs: remove "vhostfd" property +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Marc-André Lureau + +The property doesn't make much sense for a vhost-user device. + +Signed-off-by: Marc-André Lureau +Message-Id: <20191116112016.14872-1-marcandre.lureau@redhat.com> +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 703857348724319735d9be7b5b996e6445c6e6b9) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost-user-fs.c | 1 - + include/hw/virtio/vhost-user-fs.h | 1 - + 2 files changed, 2 deletions(-) + +diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c +index f0df7f4..ca0b7fc 100644 +--- a/hw/virtio/vhost-user-fs.c ++++ b/hw/virtio/vhost-user-fs.c +@@ -263,7 +263,6 @@ static Property vuf_properties[] = { + DEFINE_PROP_UINT16("num-request-queues", VHostUserFS, + conf.num_request_queues, 1), + DEFINE_PROP_UINT16("queue-size", VHostUserFS, conf.queue_size, 128), +- DEFINE_PROP_STRING("vhostfd", VHostUserFS, conf.vhostfd), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/include/hw/virtio/vhost-user-fs.h b/include/hw/virtio/vhost-user-fs.h +index 539885b..9ff1bdb 100644 +--- a/include/hw/virtio/vhost-user-fs.h ++++ b/include/hw/virtio/vhost-user-fs.h +@@ -28,7 +28,6 @@ typedef struct { + char *tag; + uint16_t num_request_queues; + uint16_t queue_size; +- char *vhostfd; + } VHostUserFSConf; + + typedef struct { +-- +1.8.3.1 + diff --git a/kvm-vhost-user-gpu-Drop-trailing-json-comma.patch b/kvm-vhost-user-gpu-Drop-trailing-json-comma.patch new file mode 100755 index 0000000..3a50632 --- /dev/null +++ b/kvm-vhost-user-gpu-Drop-trailing-json-comma.patch @@ -0,0 +1,52 @@ +From 044feb40e3041759ee77d08136f334cf3ad67c1e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?J=C3=A1n=20Tomko?= +Date: Fri, 21 Feb 2020 09:49:23 +0000 +Subject: [PATCH] vhost-user-gpu: Drop trailing json comma +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Ján Tomko +Message-id: <07fed9a38495938a7180819e27f590d80cd6668d.1582278173.git.jtomko@redhat.com> +Patchwork-id: 94019 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] vhost-user-gpu: Drop trailing json comma +Bugzilla: 1805334 +RH-Acked-by: Marc-André Lureau +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Stefan 
Hajnoczi + +From: Cole Robinson + +Trailing comma is not valid json: + +$ cat contrib/vhost-user-gpu/50-qemu-gpu.json.in | jq +parse error: Expected another key-value pair at line 5, column 1 + +Signed-off-by: Cole Robinson +Reviewed-by: Marc-André Lureau +Reviewed-by: Li Qiang +Reviewed-by: Philippe Mathieu-Daudé +Message-id: 7f5dd2ac9f3504e2699f23e69bc3d8051b729832.1568925097.git.crobinso@redhat.com +Signed-off-by: Gerd Hoffmann +(cherry picked from commit ca26b032e5a0e8a190c763ce828a8740d24b9b65) +Signed-off-by: Ján Tomko +Signed-off-by: Danilo C. L. de Paula +--- + contrib/vhost-user-gpu/50-qemu-gpu.json.in | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/contrib/vhost-user-gpu/50-qemu-gpu.json.in b/contrib/vhost-user-gpu/50-qemu-gpu.json.in +index 658b545..f5edd09 100644 +--- a/contrib/vhost-user-gpu/50-qemu-gpu.json.in ++++ b/contrib/vhost-user-gpu/50-qemu-gpu.json.in +@@ -1,5 +1,5 @@ + { + "description": "QEMU vhost-user-gpu", + "type": "gpu", +- "binary": "@libexecdir@/vhost-user-gpu", ++ "binary": "@libexecdir@/vhost-user-gpu" + } +-- +1.8.3.1 + diff --git a/kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch b/kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch new file mode 100755 index 0000000..ed10701 --- /dev/null +++ b/kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch @@ -0,0 +1,80 @@ +From b395ad369278d0923a590975fabbb99ec7716c6b Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Wed, 19 Feb 2020 21:34:28 +0000 +Subject: [PATCH 4/7] virtio: add ability to delete vq through a pointer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Julia Suvorova +Message-id: <20200219213431.11913-2-jusual@redhat.com> +Patchwork-id: 93980 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/4] virtio: add ability to delete vq through a pointer +Bugzilla: 1791590 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella 
+RH-Acked-by: Michael S. Tsirkin + +From: "Michael S. Tsirkin" + +Devices tend to maintain vq pointers, allow deleting them through a vq pointer. + +Signed-off-by: Michael S. Tsirkin +Reviewed-by: David Hildenbrand +Reviewed-by: David Hildenbrand +(cherry picked from commit 722f8c51d8af223751dfb1d02de40043e8ba067e) +Signed-off-by: Danilo C. L. de Paula +--- + hw/virtio/virtio.c | 15 ++++++++++----- + include/hw/virtio/virtio.h | 2 ++ + 2 files changed, 12 insertions(+), 5 deletions(-) + +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index 3211135..d63a369 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -2335,17 +2335,22 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, + return &vdev->vq[i]; + } + ++void virtio_delete_queue(VirtQueue *vq) ++{ ++ vq->vring.num = 0; ++ vq->vring.num_default = 0; ++ vq->handle_output = NULL; ++ vq->handle_aio_output = NULL; ++ g_free(vq->used_elems); ++} ++ + void virtio_del_queue(VirtIODevice *vdev, int n) + { + if (n < 0 || n >= VIRTIO_QUEUE_MAX) { + abort(); + } + +- vdev->vq[n].vring.num = 0; +- vdev->vq[n].vring.num_default = 0; +- vdev->vq[n].handle_output = NULL; +- vdev->vq[n].handle_aio_output = NULL; +- g_free(vdev->vq[n].used_elems); ++ virtio_delete_queue(&vdev->vq[n]); + } + + static void virtio_set_isr(VirtIODevice *vdev, int value) +diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h +index 6a20442..91167f6 100644 +--- a/include/hw/virtio/virtio.h ++++ b/include/hw/virtio/virtio.h +@@ -183,6 +183,8 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, + + void virtio_del_queue(VirtIODevice *vdev, int n); + ++void virtio_delete_queue(VirtQueue *vq); ++ + void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len); + void virtqueue_flush(VirtQueue *vq, unsigned int count); +-- +1.8.3.1 + diff --git a/kvm-virtio-add-vhost-user-fs-ccw-device.patch b/kvm-virtio-add-vhost-user-fs-ccw-device.patch new file mode 100755 index
0000000..d7d41af --- /dev/null +++ b/kvm-virtio-add-vhost-user-fs-ccw-device.patch @@ -0,0 +1,136 @@ +From fc5d5887462da813d91a3a0649214313d580d7af Mon Sep 17 00:00:00 2001 +From: Claudio Imbrenda +Date: Tue, 27 Oct 2020 12:02:16 -0400 +Subject: [PATCH 03/18] virtio: add vhost-user-fs-ccw device + +RH-Author: Claudio Imbrenda +Message-id: <20201027120217.2997314-3-cimbrend@redhat.com> +Patchwork-id: 98720 +O-Subject: [RHEL8.4 qemu-kvm PATCH 2/3] virtio: add vhost-user-fs-ccw device +Bugzilla: 1857733 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Cornelia Huck + +From: Halil Pasic + +upstream bd0bbb9aba2afbc2ea24b0475be04f795468b381 + +fixed for the backport: +* makefile logic instead of meson +* old style qdev initialization +* old style device class properties + +-- + +Wire up the CCW device for vhost-user-fs. + +Reviewed-by: Cornelia Huck +Signed-off-by: Halil Pasic +Message-id: 20200901150019.29229-2-mhartmay@linux.ibm.com +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/s390x/Makefile.objs | 1 + + hw/s390x/vhost-user-fs-ccw.c | 76 ++++++++++++++++++++++++++++++++++++ + 2 files changed, 77 insertions(+) + create mode 100644 hw/s390x/vhost-user-fs-ccw.c + +diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs +index a46a1c7894e..c4086ec3171 100644 +--- a/hw/s390x/Makefile.objs ++++ b/hw/s390x/Makefile.objs +@@ -20,6 +20,7 @@ obj-$(CONFIG_VIRTIO_NET) += virtio-ccw-net.o + obj-$(CONFIG_VIRTIO_BLK) += virtio-ccw-blk.o + obj-$(call land,$(CONFIG_VIRTIO_9P),$(CONFIG_VIRTFS)) += virtio-ccw-9p.o + obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-ccw.o ++obj-$(CONFIG_VHOST_USER_FS) += vhost-user-fs-ccw.o + endif + obj-y += css-bridge.o + obj-y += ccw-device.o +diff --git a/hw/s390x/vhost-user-fs-ccw.c b/hw/s390x/vhost-user-fs-ccw.c +new file mode 100644 +index 00000000000..e7b165d5f61 +--- /dev/null ++++ b/hw/s390x/vhost-user-fs-ccw.c +@@ -0,0 +1,76 @@ ++/* ++ * virtio ccw vhost-user-fs implementation ++ * ++ * Copyright 2020 IBM Corp. ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. 
++ */ ++#include "qemu/osdep.h" ++#include "hw/qdev-properties.h" ++#include "qapi/error.h" ++#include "hw/virtio/vhost-user-fs.h" ++#include "virtio-ccw.h" ++ ++typedef struct VHostUserFSCcw { ++ VirtioCcwDevice parent_obj; ++ VHostUserFS vdev; ++} VHostUserFSCcw; ++ ++#define TYPE_VHOST_USER_FS_CCW "vhost-user-fs-ccw" ++#define VHOST_USER_FS_CCW(obj) \ ++ OBJECT_CHECK(VHostUserFSCcw, (obj), TYPE_VHOST_USER_FS_CCW) ++ ++ ++static Property vhost_user_fs_ccw_properties[] = { ++ DEFINE_PROP_BIT("ioeventfd", VirtioCcwDevice, flags, ++ VIRTIO_CCW_FLAG_USE_IOEVENTFD_BIT, true), ++ DEFINE_PROP_UINT32("max_revision", VirtioCcwDevice, max_rev, ++ VIRTIO_CCW_MAX_REV), ++ DEFINE_PROP_END_OF_LIST(), ++}; ++ ++static void vhost_user_fs_ccw_realize(VirtioCcwDevice *ccw_dev, Error **errp) ++{ ++ VHostUserFSCcw *dev = VHOST_USER_FS_CCW(ccw_dev); ++ DeviceState *vdev = DEVICE(&dev->vdev); ++ ++ qdev_set_parent_bus(vdev, BUS(&ccw_dev->bus)); ++ object_property_set_bool(OBJECT(vdev), true, "realized", errp); ++} ++ ++static void vhost_user_fs_ccw_instance_init(Object *obj) ++{ ++ VHostUserFSCcw *dev = VHOST_USER_FS_CCW(obj); ++ VirtioCcwDevice *ccw_dev = VIRTIO_CCW_DEVICE(obj); ++ ++ ccw_dev->force_revision_1 = true; ++ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), ++ TYPE_VHOST_USER_FS); ++} ++ ++static void vhost_user_fs_ccw_class_init(ObjectClass *klass, void *data) ++{ ++ DeviceClass *dc = DEVICE_CLASS(klass); ++ VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass); ++ ++ k->realize = vhost_user_fs_ccw_realize; ++ dc->props = vhost_user_fs_ccw_properties; ++ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); ++} ++ ++static const TypeInfo vhost_user_fs_ccw = { ++ .name = TYPE_VHOST_USER_FS_CCW, ++ .parent = TYPE_VIRTIO_CCW_DEVICE, ++ .instance_size = sizeof(VHostUserFSCcw), ++ .instance_init = vhost_user_fs_ccw_instance_init, ++ .class_init = vhost_user_fs_ccw_class_init, ++}; ++ ++static void vhost_user_fs_ccw_register(void) ++{ ++ 
type_register_static(&vhost_user_fs_ccw); ++} ++ ++type_init(vhost_user_fs_ccw_register) +-- +2.27.0 + diff --git a/kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch b/kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch new file mode 100755 index 0000000..9e46be1 --- /dev/null +++ b/kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch @@ -0,0 +1,203 @@ +From fdd1f3bf672ad8bb0a6db896ec8cbc797c31da1f Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Wed, 24 Jun 2020 13:24:53 -0400 +Subject: [PATCH 11/12] virtio-blk: On restart, process queued requests in the + proper context + +RH-Author: Sergio Lopez Pascual +Message-id: <20200624132453.111276-3-slp@redhat.com> +Patchwork-id: 97798 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] virtio-blk: On restart, process queued requests in the proper context +Bugzilla: +RH-Acked-by: John Snow +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Kevin Wolf + +On restart, we were scheduling a BH to process queued requests, which +would run before starting up the data plane, leading to those requests +being assigned and started on coroutines on the main context. + +This could cause requests to be wrongly processed in parallel from +different threads (the main thread and the iothread managing the data +plane), potentially leading to multiple issues. 
+ +For example, stopping and resuming a VM multiple times while the guest +is generating I/O on a virtio_blk device can trigger a crash with a +stack tracing looking like this one: + +<------> + Thread 2 (Thread 0x7ff736765700 (LWP 1062503)): + #0 0x00005567a13b99d6 in iov_memset + (iov=0x6563617073206f4e, iov_cnt=1717922848, offset=516096, fillc=0, bytes=7018105756081554803) + at util/iov.c:69 + #1 0x00005567a13bab73 in qemu_iovec_memset + (qiov=0x7ff73ec99748, offset=516096, fillc=0, bytes=7018105756081554803) at util/iov.c:530 + #2 0x00005567a12f411c in qemu_laio_process_completion (laiocb=0x7ff6512ee6c0) at block/linux-aio.c:86 + #3 0x00005567a12f42ff in qemu_laio_process_completions (s=0x7ff7182e8420) at block/linux-aio.c:217 + #4 0x00005567a12f480d in ioq_submit (s=0x7ff7182e8420) at block/linux-aio.c:323 + #5 0x00005567a12f43d9 in qemu_laio_process_completions_and_submit (s=0x7ff7182e8420) + at block/linux-aio.c:236 + #6 0x00005567a12f44c2 in qemu_laio_poll_cb (opaque=0x7ff7182e8430) at block/linux-aio.c:267 + #7 0x00005567a13aed83 in run_poll_handlers_once (ctx=0x5567a2b58c70, timeout=0x7ff7367645f8) + at util/aio-posix.c:520 + #8 0x00005567a13aee9f in run_poll_handlers (ctx=0x5567a2b58c70, max_ns=16000, timeout=0x7ff7367645f8) + at util/aio-posix.c:562 + #9 0x00005567a13aefde in try_poll_mode (ctx=0x5567a2b58c70, timeout=0x7ff7367645f8) + at util/aio-posix.c:597 + #10 0x00005567a13af115 in aio_poll (ctx=0x5567a2b58c70, blocking=true) at util/aio-posix.c:639 + #11 0x00005567a109acca in iothread_run (opaque=0x5567a2b29760) at iothread.c:75 + #12 0x00005567a13b2790 in qemu_thread_start (args=0x5567a2b694c0) at util/qemu-thread-posix.c:519 + #13 0x00007ff73eedf2de in start_thread () at /lib64/libpthread.so.0 + #14 0x00007ff73ec10e83 in clone () at /lib64/libc.so.6 + + Thread 1 (Thread 0x7ff743986f00 (LWP 1062500)): + #0 0x00005567a13b99d6 in iov_memset + (iov=0x6563617073206f4e, iov_cnt=1717922848, offset=516096, fillc=0, bytes=7018105756081554803) + at 
util/iov.c:69 + #1 0x00005567a13bab73 in qemu_iovec_memset + (qiov=0x7ff73ec99748, offset=516096, fillc=0, bytes=7018105756081554803) at util/iov.c:530 + #2 0x00005567a12f411c in qemu_laio_process_completion (laiocb=0x7ff6512ee6c0) at block/linux-aio.c:86 + #3 0x00005567a12f42ff in qemu_laio_process_completions (s=0x7ff7182e8420) at block/linux-aio.c:217 + #4 0x00005567a12f480d in ioq_submit (s=0x7ff7182e8420) at block/linux-aio.c:323 + #5 0x00005567a12f4a2f in laio_do_submit (fd=19, laiocb=0x7ff5f4ff9ae0, offset=472363008, type=2) + at block/linux-aio.c:375 + #6 0x00005567a12f4af2 in laio_co_submit + (bs=0x5567a2b8c460, s=0x7ff7182e8420, fd=19, offset=472363008, qiov=0x7ff5f4ff9ca0, type=2) + at block/linux-aio.c:394 + #7 0x00005567a12f1803 in raw_co_prw + (bs=0x5567a2b8c460, offset=472363008, bytes=20480, qiov=0x7ff5f4ff9ca0, type=2) + at block/file-posix.c:1892 + #8 0x00005567a12f1941 in raw_co_pwritev + (bs=0x5567a2b8c460, offset=472363008, bytes=20480, qiov=0x7ff5f4ff9ca0, flags=0) + at block/file-posix.c:1925 + #9 0x00005567a12fe3e1 in bdrv_driver_pwritev + (bs=0x5567a2b8c460, offset=472363008, bytes=20480, qiov=0x7ff5f4ff9ca0, qiov_offset=0, flags=0) + at block/io.c:1183 + #10 0x00005567a1300340 in bdrv_aligned_pwritev + (child=0x5567a2b5b070, req=0x7ff5f4ff9db0, offset=472363008, bytes=20480, align=512, qiov=0x7ff72c0425b8, qiov_offset=0, flags=0) at block/io.c:1980 + #11 0x00005567a1300b29 in bdrv_co_pwritev_part + (child=0x5567a2b5b070, offset=472363008, bytes=20480, qiov=0x7ff72c0425b8, qiov_offset=0, flags=0) + at block/io.c:2137 + #12 0x00005567a12baba1 in qcow2_co_pwritev_task + (bs=0x5567a2b92740, file_cluster_offset=472317952, offset=487305216, bytes=20480, qiov=0x7ff72c0425b8, qiov_offset=0, l2meta=0x0) at block/qcow2.c:2444 + #13 0x00005567a12bacdb in qcow2_co_pwritev_task_entry (task=0x5567a2b48540) at block/qcow2.c:2475 + #14 0x00005567a13167d8 in aio_task_co (opaque=0x5567a2b48540) at block/aio_task.c:45 + #15 0x00005567a13cf00c in 
coroutine_trampoline (i0=738245600, i1=32759) at util/coroutine-ucontext.c:115 + #16 0x00007ff73eb622e0 in __start_context () at /lib64/libc.so.6 + #17 0x00007ff6626f1350 in () + #18 0x0000000000000000 in () +<------> + +This is also known to cause crashes with this message (assertion +failed): + + aio_co_schedule: Co-routine was already scheduled in 'aio_co_schedule' + +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1812765 +Signed-off-by: Sergio Lopez +Message-Id: <20200603093240.40489-3-slp@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 49b44549ace7890fffdf027fd3695218ee7f1121) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + hw/block/dataplane/virtio-blk.c | 8 ++++++++ + hw/block/virtio-blk.c | 18 ++++++++++++------ + include/hw/virtio/virtio-blk.h | 2 +- + 3 files changed, 21 insertions(+), 7 deletions(-) + +diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c +index 119906a5fe..ac495fd72a 100644 +--- a/hw/block/dataplane/virtio-blk.c ++++ b/hw/block/dataplane/virtio-blk.c +@@ -220,6 +220,9 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) + goto fail_guest_notifiers; + } + ++ /* Process queued requests before the ones in vring */ ++ virtio_blk_process_queued_requests(vblk, false); ++ + /* Kick right away to begin processing requests already in vring */ + for (i = 0; i < nvqs; i++) { + VirtQueue *vq = virtio_get_queue(s->vdev, i); +@@ -239,6 +242,11 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) + return 0; + + fail_guest_notifiers: ++ /* ++ * If we failed to set up the guest notifiers queued requests will be ++ * processed on the main context. 
++ */ ++ virtio_blk_process_queued_requests(vblk, false); + vblk->dataplane_disabled = true; + s->starting = false; + vblk->dataplane_started = true; +diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c +index 6ff29a05d6..493a263fa6 100644 +--- a/hw/block/virtio-blk.c ++++ b/hw/block/virtio-blk.c +@@ -819,7 +819,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) + virtio_blk_handle_output_do(s, vq); + } + +-void virtio_blk_process_queued_requests(VirtIOBlock *s) ++void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh) + { + VirtIOBlockReq *req = s->rq; + MultiReqBuffer mrb = {}; +@@ -847,7 +847,9 @@ void virtio_blk_process_queued_requests(VirtIOBlock *s) + if (mrb.num_reqs) { + virtio_blk_submit_multireq(s->blk, &mrb); + } +- blk_dec_in_flight(s->conf.conf.blk); ++ if (is_bh) { ++ blk_dec_in_flight(s->conf.conf.blk); ++ } + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); + } + +@@ -858,21 +860,25 @@ static void virtio_blk_dma_restart_bh(void *opaque) + qemu_bh_delete(s->bh); + s->bh = NULL; + +- virtio_blk_process_queued_requests(s); ++ virtio_blk_process_queued_requests(s, true); + } + + static void virtio_blk_dma_restart_cb(void *opaque, int running, + RunState state) + { + VirtIOBlock *s = opaque; ++ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s))); ++ VirtioBusState *bus = VIRTIO_BUS(qbus); + + if (!running) { + return; + } + +- if (!s->bh) { +- /* FIXME The data plane is not started yet, so these requests are +- * processed in the main thread. */ ++ /* ++ * If ioeventfd is enabled, don't schedule the BH here as queued ++ * requests will be processed while starting the data plane. 
++ */ ++ if (!s->bh && !virtio_bus_ioeventfd_enabled(bus)) { + s->bh = aio_bh_new(blk_get_aio_context(s->conf.conf.blk), + virtio_blk_dma_restart_bh, s); + blk_inc_in_flight(s->conf.conf.blk); +diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h +index cf8eea2f58..e77f0db3b0 100644 +--- a/include/hw/virtio/virtio-blk.h ++++ b/include/hw/virtio/virtio-blk.h +@@ -84,6 +84,6 @@ typedef struct MultiReqBuffer { + } MultiReqBuffer; + + bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq); +-void virtio_blk_process_queued_requests(VirtIOBlock *s); ++void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh); + + #endif +-- +2.27.0 + diff --git a/kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch b/kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch new file mode 100755 index 0000000..148045d --- /dev/null +++ b/kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch @@ -0,0 +1,83 @@ +From 73d83d8880e85eedc22c9651b67d1eacd5de5ff4 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Wed, 24 Jun 2020 13:24:52 -0400 +Subject: [PATCH 10/12] virtio-blk: Refactor the code that processes queued + requests + +RH-Author: Sergio Lopez Pascual +Message-id: <20200624132453.111276-2-slp@redhat.com> +Patchwork-id: 97797 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] virtio-blk: Refactor the code that processes queued requests +Bugzilla: +RH-Acked-by: John Snow +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Kevin Wolf + +Move the code that processes queued requests from +virtio_blk_dma_restart_bh() to its own, non-static, function. This +will allow us to call it from the virtio_blk_data_plane_start() in a +future patch. + +Signed-off-by: Sergio Lopez +Message-Id: <20200603093240.40489-2-slp@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 7aa1c247b466870b0704d3ccdc3755e5e7394dca) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/block/virtio-blk.c | 16 +++++++++++----- + include/hw/virtio/virtio-blk.h | 1 + + 2 files changed, 12 insertions(+), 5 deletions(-) + +diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c +index c4e55fb3de..6ff29a05d6 100644 +--- a/hw/block/virtio-blk.c ++++ b/hw/block/virtio-blk.c +@@ -819,15 +819,11 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) + virtio_blk_handle_output_do(s, vq); + } + +-static void virtio_blk_dma_restart_bh(void *opaque) ++void virtio_blk_process_queued_requests(VirtIOBlock *s) + { +- VirtIOBlock *s = opaque; + VirtIOBlockReq *req = s->rq; + MultiReqBuffer mrb = {}; + +- qemu_bh_delete(s->bh); +- s->bh = NULL; +- + s->rq = NULL; + + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); +@@ -855,6 +851,16 @@ static void virtio_blk_dma_restart_bh(void *opaque) + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); + } + ++static void virtio_blk_dma_restart_bh(void *opaque) ++{ ++ VirtIOBlock *s = opaque; ++ ++ qemu_bh_delete(s->bh); ++ s->bh = NULL; ++ ++ virtio_blk_process_queued_requests(s); ++} ++ + static void virtio_blk_dma_restart_cb(void *opaque, int running, + RunState state) + { +diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h +index cddcfbebe9..cf8eea2f58 100644 +--- a/include/hw/virtio/virtio-blk.h ++++ b/include/hw/virtio/virtio-blk.h +@@ -84,5 +84,6 @@ typedef struct MultiReqBuffer { + } MultiReqBuffer; + + bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq); ++void virtio_blk_process_queued_requests(VirtIOBlock *s); + + #endif +-- +2.27.0 + diff --git a/kvm-virtio-don-t-enable-notifications-during-polling.patch b/kvm-virtio-don-t-enable-notifications-during-polling.patch new file mode 100755 index 0000000..2dffc01 --- /dev/null +++ b/kvm-virtio-don-t-enable-notifications-during-polling.patch @@ -0,0 +1,158 @@ +From 351dd07d7b5e69cdf47260c9ea848c0c93cd2c8a Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 9 Jan 2020 
11:13:25 +0000 +Subject: [PATCH 3/5] virtio: don't enable notifications during polling + +RH-Author: Stefan Hajnoczi +Message-id: <20200109111325.559557-2-stefanha@redhat.com> +Patchwork-id: 93298 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] virtio: don't enable notifications during polling +Bugzilla: 1789301 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Michael S. Tsirkin + +Virtqueue notifications are not necessary during polling, so we disable +them. This allows the guest driver to avoid MMIO vmexits. +Unfortunately the virtio-blk and virtio-scsi handler functions re-enable +notifications, defeating this optimization. + +Fix virtio-blk and virtio-scsi emulation so they leave notifications +disabled. The key thing to remember for correctness is that polling +always checks one last time after ending its loop, therefore it's safe +to lose the race when re-enabling notifications at the end of polling. + +There is a measurable performance improvement of 5-10% with the null-co +block driver. Real-life storage configurations will see a smaller +improvement because the MMIO vmexit overhead contributes less to +latency. + +Signed-off-by: Stefan Hajnoczi +Message-Id: <20191209210957.65087-1-stefanha@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit d0435bc513e23a4961b6af20164d1c6c219eb4ea) +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/block/virtio-blk.c | 9 +++++++-- + hw/scsi/virtio-scsi.c | 9 +++++++-- + hw/virtio/virtio.c | 12 ++++++------ + include/hw/virtio/virtio.h | 1 + + 4 files changed, 21 insertions(+), 10 deletions(-) + +diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c +index 4c357d2..c4e55fb 100644 +--- a/hw/block/virtio-blk.c ++++ b/hw/block/virtio-blk.c +@@ -764,13 +764,16 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) + { + VirtIOBlockReq *req; + MultiReqBuffer mrb = {}; ++ bool suppress_notifications = virtio_queue_get_notification(vq); + bool progress = false; + + aio_context_acquire(blk_get_aio_context(s->blk)); + blk_io_plug(s->blk); + + do { +- virtio_queue_set_notification(vq, 0); ++ if (suppress_notifications) { ++ virtio_queue_set_notification(vq, 0); ++ } + + while ((req = virtio_blk_get_request(s, vq))) { + progress = true; +@@ -781,7 +784,9 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) + } + } + +- virtio_queue_set_notification(vq, 1); ++ if (suppress_notifications) { ++ virtio_queue_set_notification(vq, 1); ++ } + } while (!virtio_queue_empty(vq)); + + if (mrb.num_reqs) { +diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c +index 54108c0..e2cd1df 100644 +--- a/hw/scsi/virtio-scsi.c ++++ b/hw/scsi/virtio-scsi.c +@@ -597,12 +597,15 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) + { + VirtIOSCSIReq *req, *next; + int ret = 0; ++ bool suppress_notifications = virtio_queue_get_notification(vq); + bool progress = false; + + QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs); + + do { +- virtio_queue_set_notification(vq, 0); ++ if (suppress_notifications) { ++ virtio_queue_set_notification(vq, 0); ++ } + + while ((req = virtio_scsi_pop_req(s, vq))) { + progress = true; +@@ -622,7 +625,9 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) + } + } + +- virtio_queue_set_notification(vq, 1); ++ if (suppress_notifications) { ++ virtio_queue_set_notification(vq, 1); ++ } + } 
while (ret != -EINVAL && !virtio_queue_empty(vq)); + + QTAILQ_FOREACH_SAFE(req, &reqs, next, next) { +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index 04716b5..3211135 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -432,6 +432,11 @@ static void virtio_queue_packed_set_notification(VirtQueue *vq, int enable) + } + } + ++bool virtio_queue_get_notification(VirtQueue *vq) ++{ ++ return vq->notification; ++} ++ + void virtio_queue_set_notification(VirtQueue *vq, int enable) + { + vq->notification = enable; +@@ -3384,17 +3389,12 @@ static bool virtio_queue_host_notifier_aio_poll(void *opaque) + { + EventNotifier *n = opaque; + VirtQueue *vq = container_of(n, VirtQueue, host_notifier); +- bool progress; + + if (!vq->vring.desc || virtio_queue_empty(vq)) { + return false; + } + +- progress = virtio_queue_notify_aio_vq(vq); +- +- /* In case the handler function re-enabled notifications */ +- virtio_queue_set_notification(vq, 0); +- return progress; ++ return virtio_queue_notify_aio_vq(vq); + } + + static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n) +diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h +index c32a815..6a20442 100644 +--- a/include/hw/virtio/virtio.h ++++ b/include/hw/virtio/virtio.h +@@ -224,6 +224,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id); + + void virtio_notify_config(VirtIODevice *vdev); + ++bool virtio_queue_get_notification(VirtQueue *vq); + void virtio_queue_set_notification(VirtQueue *vq, int enable); + + int virtio_queue_ready(VirtQueue *vq); +-- +1.8.3.1 + diff --git a/kvm-virtio-fs-fix-MSI-X-nvectors-calculation.patch b/kvm-virtio-fs-fix-MSI-X-nvectors-calculation.patch new file mode 100755 index 0000000..9a69ed1 --- /dev/null +++ b/kvm-virtio-fs-fix-MSI-X-nvectors-calculation.patch @@ -0,0 +1,60 @@ +From c0cf6d8a1d3b9bf3928f37fcfd5aa8ae6f1338ca Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:36 +0100 +Subject: [PATCH 005/116] virtio-fs: fix MSI-X nvectors calculation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-2-dgilbert@redhat.com> +Patchwork-id: 93455 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 001/112] virtio-fs: fix MSI-X nvectors calculation +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +The following MSI-X vectors are required: + * VIRTIO Configuration Change + * hiprio virtqueue + * requests virtqueues + +Fix the calculation to reserve enough MSI-X vectors. Otherwise guest +drivers fall back to a sub-optimal configuration where all virtqueues +share a single vector. + +This change does not break live migration compatibility since +vhost-user-fs-pci devices are not migratable yet. + +Reported-by: Vivek Goyal +Signed-off-by: Stefan Hajnoczi +Message-Id: <20191209110759.35227-1-stefanha@redhat.com> +Reviewed-by: Dr. David Alan Gilbert +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 366844f3d1329c6423dd752891a28ccb3ee8fddd) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost-user-fs-pci.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/virtio/vhost-user-fs-pci.c b/hw/virtio/vhost-user-fs-pci.c +index 933a3f2..e3a649d 100644 +--- a/hw/virtio/vhost-user-fs-pci.c ++++ b/hw/virtio/vhost-user-fs-pci.c +@@ -40,7 +40,8 @@ static void vhost_user_fs_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) + DeviceState *vdev = DEVICE(&dev->vdev); + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { +- vpci_dev->nvectors = dev->vdev.conf.num_request_queues + 1; ++ /* Also reserve config change and hiprio queue vectors */ ++ vpci_dev->nvectors = dev->vdev.conf.num_request_queues + 2; + } + + qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); +-- +1.8.3.1 + diff --git a/kvm-virtio-make-virtio_delete_queue-idempotent.patch b/kvm-virtio-make-virtio_delete_queue-idempotent.patch new file mode 100755 index 0000000..16eb1da --- /dev/null +++ b/kvm-virtio-make-virtio_delete_queue-idempotent.patch @@ -0,0 +1,42 @@ +From 901e65fa6ccbadeacd6c585cf49a0a7cdafb4737 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Wed, 19 Feb 2020 21:34:29 +0000 +Subject: [PATCH 5/7] virtio: make virtio_delete_queue idempotent + +RH-Author: Julia Suvorova +Message-id: <20200219213431.11913-3-jusual@redhat.com> +Patchwork-id: 93981 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/4] virtio: make virtio_delete_queue idempotent +Bugzilla: 1791590 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Michael S. Tsirkin + +From: "Michael S. Tsirkin" + +Let's make sure calling this twice is harmless - +no known instances, but seems safer. + +Suggested-by: Pan Nengyuan +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 8cd353ea0fbf0e334e015d833f612799be642296) +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/virtio/virtio.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index d63a369..e6a9ba4 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -2342,6 +2342,7 @@ void virtio_delete_queue(VirtQueue *vq) + vq->handle_output = NULL; + vq->handle_aio_output = NULL; + g_free(vq->used_elems); ++ vq->used_elems = NULL; + } + + void virtio_del_queue(VirtIODevice *vdev, int n) +-- +1.8.3.1 + diff --git a/kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch b/kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch new file mode 100755 index 0000000..c21c699 --- /dev/null +++ b/kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch @@ -0,0 +1,49 @@ +From 2f494c41715193522c52eafc6af2a5e33f88ceb9 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Wed, 19 Feb 2020 21:34:31 +0000 +Subject: [PATCH 7/7] virtio-net: delete also control queue when TX/RX deleted + +RH-Author: Julia Suvorova +Message-id: <20200219213431.11913-5-jusual@redhat.com> +Patchwork-id: 93983 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/4] virtio-net: delete also control queue when TX/RX deleted +Bugzilla: 1791590 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Michael S. Tsirkin + +From: Yuri Benditovich + +https://bugzilla.redhat.com/show_bug.cgi?id=1708480 +If the control queue is not deleted together with TX/RX, it +later will be ignored in freeing cache resources and hot +unplug will not be completed. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Yuri Benditovich +Message-Id: <20191226043649.14481-3-yuri.benditovich@daynix.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit d945d9f1731244ef341f74ede93120fc9de35913) +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/virtio-net.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index db3d7c3..f325440 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -3101,7 +3101,8 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp) + for (i = 0; i < max_queues; i++) { + virtio_net_del_queue(n, i); + } +- ++ /* delete also control vq */ ++ virtio_del_queue(vdev, max_queues * 2); + qemu_announce_timer_del(&n->announce_timer, false); + g_free(n->vqs); + qemu_del_nic(n->nic); +-- +1.8.3.1 + diff --git a/kvm-virtio-net-fix-removal-of-failover-device.patch b/kvm-virtio-net-fix-removal-of-failover-device.patch new file mode 100755 index 0000000..6044f3d --- /dev/null +++ b/kvm-virtio-net-fix-removal-of-failover-device.patch @@ -0,0 +1,52 @@ +From 92fb4f6cdde32652352a0a831a2ba815701a4014 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Fri, 3 Jul 2020 12:37:05 -0400 +Subject: [PATCH 4/4] virtio-net: fix removal of failover device + +RH-Author: Juan Quintela +Message-id: <20200703123705.7175-2-quintela@redhat.com> +Patchwork-id: 97901 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] virtio-net: fix removal of failover device +Bugzilla: +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Laurent Vivier +RH-Acked-by: Dr. David Alan Gilbert + +If you have a networking device and its virtio failover device, and +you remove them in this order: +- virtio device +- the real device + +You get a qemu crash. +See bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1820120 + +The bug exists in qemu 4.2 and 5.0. +But in 5.0 it doesn't show because commit +77b06bba62034a87cc61a9c8de1309ae3e527d97 + +somehow papers over it. + +CC: Jason Wang +CC: Michael S. Tsirkin + +Signed-off-by: Juan Quintela +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/net/virtio-net.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index f325440d01..dabeb9e720 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -3091,6 +3091,7 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp) + g_free(n->vlans); + + if (n->failover) { ++ device_listener_unregister(&n->primary_listener); + g_free(n->primary_device_id); + g_free(n->standby_id); + qobject_unref(n->primary_device_dict); +-- +2.27.0 + diff --git a/kvm-virtio-reset-region-cache-when-on-queue-deletion.patch b/kvm-virtio-reset-region-cache-when-on-queue-deletion.patch new file mode 100755 index 0000000..c9f1086 --- /dev/null +++ b/kvm-virtio-reset-region-cache-when-on-queue-deletion.patch @@ -0,0 +1,46 @@ +From 8bf4f561262d9282cebdb3418cdb9a69c92216a0 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Wed, 19 Feb 2020 21:34:30 +0000 +Subject: [PATCH 6/7] virtio: reset region cache when on queue deletion + +RH-Author: Julia Suvorova +Message-id: <20200219213431.11913-4-jusual@redhat.com> +Patchwork-id: 93982 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/4] virtio: reset region cache when on queue deletion +Bugzilla: 1791590 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Michael S. Tsirkin + +From: Yuri Benditovich + +https://bugzilla.redhat.com/show_bug.cgi?id=1708480 +Fix leak of region reference that prevents complete +device deletion on hot unplug. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Yuri Benditovich +Message-Id: <20191226043649.14481-2-yuri.benditovich@daynix.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 421afd2fe8dd4603216cbf36081877c391f5a2a4) +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/virtio/virtio.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index e6a9ba4..f644d9a 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -2343,6 +2343,7 @@ void virtio_delete_queue(VirtQueue *vq) + vq->handle_aio_output = NULL; + g_free(vq->used_elems); + vq->used_elems = NULL; ++ virtio_virtqueue_reset_region_cache(vq); + } + + void virtio_del_queue(VirtIODevice *vdev, int n) +-- +1.8.3.1 + diff --git a/kvm-virtiofs-Add-maintainers-entry.patch b/kvm-virtiofs-Add-maintainers-entry.patch new file mode 100755 index 0000000..fec9371 --- /dev/null +++ b/kvm-virtiofs-Add-maintainers-entry.patch @@ -0,0 +1,52 @@ +From f4144443eacceb04823ee72cb2d4f9f841f05495 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:11 +0100 +Subject: [PATCH 040/116] virtiofs: Add maintainers entry +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-37-dgilbert@redhat.com> +Patchwork-id: 93491 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 036/112] virtiofs: Add maintainers entry +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Tested-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit bad7d2c3ad1af9344df035aedaf8e0967a543070) +Signed-off-by: Miroslav Rezanina +--- + MAINTAINERS | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/MAINTAINERS b/MAINTAINERS +index 5e5e3e5..d1b3e26 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -1575,6 +1575,14 @@ T: git https://github.com/cohuck/qemu.git s390-next + T: git https://github.com/borntraeger/qemu.git s390-next + L: qemu-s390x@nongnu.org + ++virtiofs ++M: Dr. 
David Alan Gilbert ++M: Stefan Hajnoczi ++S: Supported ++F: tools/virtiofsd/* ++F: hw/virtio/vhost-user-fs* ++F: include/hw/virtio/vhost-user-fs.h ++ + virtio-input + M: Gerd Hoffmann + S: Maintained +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-ID-to-the-log-with-FUSE_LOG_DEBUG-leve.patch b/kvm-virtiofsd-Add-ID-to-the-log-with-FUSE_LOG_DEBUG-leve.patch new file mode 100755 index 0000000..a2b91be --- /dev/null +++ b/kvm-virtiofsd-Add-ID-to-the-log-with-FUSE_LOG_DEBUG-leve.patch @@ -0,0 +1,86 @@ +From 4d9106acfd7ed9e4d197ddf9f22b79ba6c8afdd8 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:38 +0100 +Subject: [PATCH 067/116] virtiofsd: Add ID to the log with FUSE_LOG_DEBUG + level +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-64-dgilbert@redhat.com> +Patchwork-id: 93514 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 063/112] virtiofsd: Add ID to the log with FUSE_LOG_DEBUG level +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Masayoshi Mizuma + +virtiofsd has some threads, so we see a lot of logs with debug option. +It would be useful for debugging if we can identify the specific thread +from the log. + +Add ID, which is got by gettid(), to the log with FUSE_LOG_DEBUG level +so that we can grep the specific thread. + +The log is like as: + + ]# ./virtiofsd -d -o vhost_user_socket=/tmp/vhostqemu0 -o source=/tmp/share0 -o cache=auto + ... 
+ [ID: 00000097] unique: 12696, success, outsize: 120 + [ID: 00000097] virtio_send_msg: elem 18: with 2 in desc of length 120 + [ID: 00000003] fv_queue_thread: Got queue event on Queue 1 + [ID: 00000003] fv_queue_thread: Queue 1 gave evalue: 1 available: in: 65552 out: 80 + [ID: 00000003] fv_queue_thread: Waiting for Queue 1 event + [ID: 00000071] fv_queue_worker: elem 33: with 2 out desc of length 80 bad_in_num=0 bad_out_num=0 + [ID: 00000071] unique: 12694, opcode: READ (15), nodeid: 2, insize: 80, pid: 2014 + [ID: 00000071] lo_read(ino=2, size=65536, off=131072) + +Signed-off-by: Masayoshi Mizuma + +Signed-off-by: Dr. David Alan Gilbert + added rework as suggested by Daniel P. Berrangé during review +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 36f3846902bd41413f6c0bf797dee509028c29f4) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index ff6910f..f08324f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -2268,10 +2269,17 @@ static void setup_nofile_rlimit(void) + + static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) + { ++ g_autofree char *localfmt = NULL; ++ + if (current_log_level < level) { + return; + } + ++ if (current_log_level == FUSE_LOG_DEBUG) { ++ localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), fmt); ++ fmt = localfmt; ++ } ++ + if (use_syslog) { + int priority = LOG_ERR; + switch (level) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-Makefile-wiring-for-virtiofsd-contrib.patch b/kvm-virtiofsd-Add-Makefile-wiring-for-virtiofsd-contrib.patch new file mode 100755 index 0000000..b017bf4 --- /dev/null +++ 
b/kvm-virtiofsd-Add-Makefile-wiring-for-virtiofsd-contrib.patch @@ -0,0 +1,106 @@ +From 709408de33112d32b7c6675f8c9320b8bebccd58 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:05 +0100 +Subject: [PATCH 034/116] virtiofsd: Add Makefile wiring for virtiofsd contrib +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-31-dgilbert@redhat.com> +Patchwork-id: 93482 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 030/112] virtiofsd: Add Makefile wiring for virtiofsd contrib +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Wire up the building of the virtiofsd in tools. + +virtiofsd relies on Linux-specific system calls and seccomp. Anyone +wishing to port it to other host operating systems should do so +carefully and without reducing security. + +Only allow building on Linux hosts. + +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Liam Merwick +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 81bfc42dcf473bc8d3790622633410da72d8e622) +Signed-off-by: Miroslav Rezanina +--- + Makefile | 10 ++++++++++ + Makefile.objs | 1 + + tools/virtiofsd/Makefile.objs | 9 +++++++++ + 3 files changed, 20 insertions(+) + create mode 100644 tools/virtiofsd/Makefile.objs + +diff --git a/Makefile b/Makefile +index 4254950..1526775 100644 +--- a/Makefile ++++ b/Makefile +@@ -330,6 +330,10 @@ endif + endif + endif + ++ifdef CONFIG_LINUX ++HELPERS-y += virtiofsd$(EXESUF) ++endif ++ + # Sphinx does not allow building manuals into the same directory as + # the source files, so if we're doing an in-tree QEMU build we must + # build the manuals into a subdirectory (and then install them from +@@ -430,6 +434,7 @@ dummy := $(call unnest-vars,, \ + elf2dmp-obj-y \ + ivshmem-client-obj-y \ + ivshmem-server-obj-y \ ++ virtiofsd-obj-y \ + rdmacm-mux-obj-y \ + libvhost-user-obj-y \ + vhost-user-scsi-obj-y \ +@@ -675,6 +680,11 @@ rdmacm-mux$(EXESUF): LIBS += "-libumad" + rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS) + $(call LINK, $^) + ++ifdef CONFIG_LINUX # relies on Linux-specific syscalls ++virtiofsd$(EXESUF): $(virtiofsd-obj-y) libvhost-user.a $(COMMON_LDADDS) ++ $(call LINK, $^) ++endif ++ + vhost-user-gpu$(EXESUF): $(vhost-user-gpu-obj-y) $(libvhost-user-obj-y) libqemuutil.a libqemustub.a + $(call LINK, $^) + +diff --git a/Makefile.objs b/Makefile.objs +index fcf63e1..1a8f288 100644 +--- a/Makefile.objs ++++ b/Makefile.objs +@@ -125,6 +125,7 @@ vhost-user-blk-obj-y = contrib/vhost-user-blk/ + rdmacm-mux-obj-y = contrib/rdmacm-mux/ + vhost-user-input-obj-y = contrib/vhost-user-input/ + vhost-user-gpu-obj-y = contrib/vhost-user-gpu/ ++virtiofsd-obj-y = tools/virtiofsd/ + + ###################################################################### + trace-events-subdirs = +diff --git a/tools/virtiofsd/Makefile.objs b/tools/virtiofsd/Makefile.objs +new file mode 100644 +index 0000000..45a8075 +--- /dev/null ++++ 
b/tools/virtiofsd/Makefile.objs +@@ -0,0 +1,9 @@ ++virtiofsd-obj-y = buffer.o \ ++ fuse_opt.o \ ++ fuse_log.o \ ++ fuse_lowlevel.o \ ++ fuse_signals.o \ ++ fuse_virtio.o \ ++ helper.o \ ++ passthrough_ll.o ++ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-auxiliary-.c-s.patch b/kvm-virtiofsd-Add-auxiliary-.c-s.patch new file mode 100755 index 0000000..90150d9 --- /dev/null +++ b/kvm-virtiofsd-Add-auxiliary-.c-s.patch @@ -0,0 +1,1387 @@ +From 55b4059d6399c212109c758190e15b574accdd07 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:41 +0100 +Subject: [PATCH 010/116] virtiofsd: Add auxiliary .c's +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-7-dgilbert@redhat.com> +Patchwork-id: 93461 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 006/112] virtiofsd: Add auxiliary .c's +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Add most of the non-main .c files we need from upstream fuse-3.8.0 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit ffcf8d9f8649c6e56b1193bbbc9c9f7388920043) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 321 ++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_log.c | 40 ++++ + tools/virtiofsd/fuse_opt.c | 423 +++++++++++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_signals.c | 91 +++++++++ + tools/virtiofsd/helper.c | 440 +++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 1315 insertions(+) + create mode 100644 tools/virtiofsd/buffer.c + create mode 100644 tools/virtiofsd/fuse_log.c + create mode 100644 tools/virtiofsd/fuse_opt.c + create mode 100644 tools/virtiofsd/fuse_signals.c + create mode 100644 tools/virtiofsd/helper.c + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +new file mode 100644 +index 0000000..5ab9b87 +--- /dev/null ++++ b/tools/virtiofsd/buffer.c +@@ -0,0 +1,321 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2010 Miklos Szeredi ++ ++ Functions for dealing with `struct fuse_buf` and `struct ++ fuse_bufvec`. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#define _GNU_SOURCE ++ ++#include "config.h" ++#include "fuse_i.h" ++#include "fuse_lowlevel.h" ++#include ++#include ++#include ++#include ++ ++size_t fuse_buf_size(const struct fuse_bufvec *bufv) ++{ ++ size_t i; ++ size_t size = 0; ++ ++ for (i = 0; i < bufv->count; i++) { ++ if (bufv->buf[i].size == SIZE_MAX) ++ size = SIZE_MAX; ++ else ++ size += bufv->buf[i].size; ++ } ++ ++ return size; ++} ++ ++static size_t min_size(size_t s1, size_t s2) ++{ ++ return s1 < s2 ? 
s1 : s2; ++} ++ ++static ssize_t fuse_buf_write(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) ++{ ++ ssize_t res = 0; ++ size_t copied = 0; ++ ++ while (len) { ++ if (dst->flags & FUSE_BUF_FD_SEEK) { ++ res = pwrite(dst->fd, (char *)src->mem + src_off, len, ++ dst->pos + dst_off); ++ } else { ++ res = write(dst->fd, (char *)src->mem + src_off, len); ++ } ++ if (res == -1) { ++ if (!copied) ++ return -errno; ++ break; ++ } ++ if (res == 0) ++ break; ++ ++ copied += res; ++ if (!(dst->flags & FUSE_BUF_FD_RETRY)) ++ break; ++ ++ src_off += res; ++ dst_off += res; ++ len -= res; ++ } ++ ++ return copied; ++} ++ ++static ssize_t fuse_buf_read(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) ++{ ++ ssize_t res = 0; ++ size_t copied = 0; ++ ++ while (len) { ++ if (src->flags & FUSE_BUF_FD_SEEK) { ++ res = pread(src->fd, (char *)dst->mem + dst_off, len, ++ src->pos + src_off); ++ } else { ++ res = read(src->fd, (char *)dst->mem + dst_off, len); ++ } ++ if (res == -1) { ++ if (!copied) ++ return -errno; ++ break; ++ } ++ if (res == 0) ++ break; ++ ++ copied += res; ++ if (!(src->flags & FUSE_BUF_FD_RETRY)) ++ break; ++ ++ dst_off += res; ++ src_off += res; ++ len -= res; ++ } ++ ++ return copied; ++} ++ ++static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) ++{ ++ char buf[4096]; ++ struct fuse_buf tmp = { ++ .size = sizeof(buf), ++ .flags = 0, ++ }; ++ ssize_t res; ++ size_t copied = 0; ++ ++ tmp.mem = buf; ++ ++ while (len) { ++ size_t this_len = min_size(tmp.size, len); ++ size_t read_len; ++ ++ res = fuse_buf_read(&tmp, 0, src, src_off, this_len); ++ if (res < 0) { ++ if (!copied) ++ return res; ++ break; ++ } ++ if (res == 0) ++ break; ++ ++ read_len = res; ++ res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len); ++ if (res < 0) { ++ if (!copied) ++ return res; ++ 
break; ++ } ++ if (res == 0) ++ break; ++ ++ copied += res; ++ ++ if (res < this_len) ++ break; ++ ++ dst_off += res; ++ src_off += res; ++ len -= res; ++ } ++ ++ return copied; ++} ++ ++#ifdef HAVE_SPLICE ++static ssize_t fuse_buf_splice(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len, enum fuse_buf_copy_flags flags) ++{ ++ int splice_flags = 0; ++ off_t *srcpos = NULL; ++ off_t *dstpos = NULL; ++ off_t srcpos_val; ++ off_t dstpos_val; ++ ssize_t res; ++ size_t copied = 0; ++ ++ if (flags & FUSE_BUF_SPLICE_MOVE) ++ splice_flags |= SPLICE_F_MOVE; ++ if (flags & FUSE_BUF_SPLICE_NONBLOCK) ++ splice_flags |= SPLICE_F_NONBLOCK; ++ ++ if (src->flags & FUSE_BUF_FD_SEEK) { ++ srcpos_val = src->pos + src_off; ++ srcpos = &srcpos_val; ++ } ++ if (dst->flags & FUSE_BUF_FD_SEEK) { ++ dstpos_val = dst->pos + dst_off; ++ dstpos = &dstpos_val; ++ } ++ ++ while (len) { ++ res = splice(src->fd, srcpos, dst->fd, dstpos, len, ++ splice_flags); ++ if (res == -1) { ++ if (copied) ++ break; ++ ++ if (errno != EINVAL || (flags & FUSE_BUF_FORCE_SPLICE)) ++ return -errno; ++ ++ /* Maybe splice is not supported for this combination */ ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, ++ len); ++ } ++ if (res == 0) ++ break; ++ ++ copied += res; ++ if (!(src->flags & FUSE_BUF_FD_RETRY) && ++ !(dst->flags & FUSE_BUF_FD_RETRY)) { ++ break; ++ } ++ ++ len -= res; ++ } ++ ++ return copied; ++} ++#else ++static ssize_t fuse_buf_splice(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len, enum fuse_buf_copy_flags flags) ++{ ++ (void) flags; ++ ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); ++} ++#endif ++ ++ ++static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len, enum fuse_buf_copy_flags flags) ++{ ++ int src_is_fd = src->flags & FUSE_BUF_IS_FD; ++ int dst_is_fd = dst->flags & 
FUSE_BUF_IS_FD; ++ ++ if (!src_is_fd && !dst_is_fd) { ++ char *dstmem = (char *)dst->mem + dst_off; ++ char *srcmem = (char *)src->mem + src_off; ++ ++ if (dstmem != srcmem) { ++ if (dstmem + len <= srcmem || srcmem + len <= dstmem) ++ memcpy(dstmem, srcmem, len); ++ else ++ memmove(dstmem, srcmem, len); ++ } ++ ++ return len; ++ } else if (!src_is_fd) { ++ return fuse_buf_write(dst, dst_off, src, src_off, len); ++ } else if (!dst_is_fd) { ++ return fuse_buf_read(dst, dst_off, src, src_off, len); ++ } else if (flags & FUSE_BUF_NO_SPLICE) { ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); ++ } else { ++ return fuse_buf_splice(dst, dst_off, src, src_off, len, flags); ++ } ++} ++ ++static const struct fuse_buf *fuse_bufvec_current(struct fuse_bufvec *bufv) ++{ ++ if (bufv->idx < bufv->count) ++ return &bufv->buf[bufv->idx]; ++ else ++ return NULL; ++} ++ ++static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) ++{ ++ const struct fuse_buf *buf = fuse_bufvec_current(bufv); ++ ++ bufv->off += len; ++ assert(bufv->off <= buf->size); ++ if (bufv->off == buf->size) { ++ assert(bufv->idx < bufv->count); ++ bufv->idx++; ++ if (bufv->idx == bufv->count) ++ return 0; ++ bufv->off = 0; ++ } ++ return 1; ++} ++ ++ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv, ++ enum fuse_buf_copy_flags flags) ++{ ++ size_t copied = 0; ++ ++ if (dstv == srcv) ++ return fuse_buf_size(dstv); ++ ++ for (;;) { ++ const struct fuse_buf *src = fuse_bufvec_current(srcv); ++ const struct fuse_buf *dst = fuse_bufvec_current(dstv); ++ size_t src_len; ++ size_t dst_len; ++ size_t len; ++ ssize_t res; ++ ++ if (src == NULL || dst == NULL) ++ break; ++ ++ src_len = src->size - srcv->off; ++ dst_len = dst->size - dstv->off; ++ len = min_size(src_len, dst_len); ++ ++ res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len, flags); ++ if (res < 0) { ++ if (!copied) ++ return res; ++ break; ++ } ++ copied += res; ++ ++ if (!fuse_bufvec_advance(srcv, res) 
|| ++ !fuse_bufvec_advance(dstv, res)) ++ break; ++ ++ if (res < len) ++ break; ++ } ++ ++ return copied; ++} +diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c +new file mode 100644 +index 0000000..0d268ab +--- /dev/null ++++ b/tools/virtiofsd/fuse_log.c +@@ -0,0 +1,40 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2019 Red Hat, Inc. ++ ++ Logging API. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#include "fuse_log.h" ++ ++#include ++#include ++ ++static void default_log_func( ++ __attribute__(( unused )) enum fuse_log_level level, ++ const char *fmt, va_list ap) ++{ ++ vfprintf(stderr, fmt, ap); ++} ++ ++static fuse_log_func_t log_func = default_log_func; ++ ++void fuse_set_log_func(fuse_log_func_t func) ++{ ++ if (!func) ++ func = default_log_func; ++ ++ log_func = func; ++} ++ ++void fuse_log(enum fuse_log_level level, const char *fmt, ...) ++{ ++ va_list ap; ++ ++ va_start(ap, fmt); ++ log_func(level, fmt, ap); ++ va_end(ap); ++} +diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c +new file mode 100644 +index 0000000..93066b9 +--- /dev/null ++++ b/tools/virtiofsd/fuse_opt.c +@@ -0,0 +1,423 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ Implementation of option parsing routines (dealing with `struct ++ fuse_args`). ++ ++ This program can be distributed under the terms of the GNU LGPLv2. 
++ See the file COPYING.LIB ++*/ ++ ++#include "config.h" ++#include "fuse_i.h" ++#include "fuse_opt.h" ++#include "fuse_misc.h" ++ ++#include ++#include ++#include ++#include ++ ++struct fuse_opt_context { ++ void *data; ++ const struct fuse_opt *opt; ++ fuse_opt_proc_t proc; ++ int argctr; ++ int argc; ++ char **argv; ++ struct fuse_args outargs; ++ char *opts; ++ int nonopt; ++}; ++ ++void fuse_opt_free_args(struct fuse_args *args) ++{ ++ if (args) { ++ if (args->argv && args->allocated) { ++ int i; ++ for (i = 0; i < args->argc; i++) ++ free(args->argv[i]); ++ free(args->argv); ++ } ++ args->argc = 0; ++ args->argv = NULL; ++ args->allocated = 0; ++ } ++} ++ ++static int alloc_failed(void) ++{ ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; ++} ++ ++int fuse_opt_add_arg(struct fuse_args *args, const char *arg) ++{ ++ char **newargv; ++ char *newarg; ++ ++ assert(!args->argv || args->allocated); ++ ++ newarg = strdup(arg); ++ if (!newarg) ++ return alloc_failed(); ++ ++ newargv = realloc(args->argv, (args->argc + 2) * sizeof(char *)); ++ if (!newargv) { ++ free(newarg); ++ return alloc_failed(); ++ } ++ ++ args->argv = newargv; ++ args->allocated = 1; ++ args->argv[args->argc++] = newarg; ++ args->argv[args->argc] = NULL; ++ return 0; ++} ++ ++static int fuse_opt_insert_arg_common(struct fuse_args *args, int pos, ++ const char *arg) ++{ ++ assert(pos <= args->argc); ++ if (fuse_opt_add_arg(args, arg) == -1) ++ return -1; ++ ++ if (pos != args->argc - 1) { ++ char *newarg = args->argv[args->argc - 1]; ++ memmove(&args->argv[pos + 1], &args->argv[pos], ++ sizeof(char *) * (args->argc - pos - 1)); ++ args->argv[pos] = newarg; ++ } ++ return 0; ++} ++ ++int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg) ++{ ++ return fuse_opt_insert_arg_common(args, pos, arg); ++} ++ ++static int next_arg(struct fuse_opt_context *ctx, const char *opt) ++{ ++ if (ctx->argctr + 1 >= ctx->argc) { ++ fuse_log(FUSE_LOG_ERR, "fuse: 
missing argument after `%s'\n", opt); ++ return -1; ++ } ++ ctx->argctr++; ++ return 0; ++} ++ ++static int add_arg(struct fuse_opt_context *ctx, const char *arg) ++{ ++ return fuse_opt_add_arg(&ctx->outargs, arg); ++} ++ ++static int add_opt_common(char **opts, const char *opt, int esc) ++{ ++ unsigned oldlen = *opts ? strlen(*opts) : 0; ++ char *d = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1); ++ ++ if (!d) ++ return alloc_failed(); ++ ++ *opts = d; ++ if (oldlen) { ++ d += oldlen; ++ *d++ = ','; ++ } ++ ++ for (; *opt; opt++) { ++ if (esc && (*opt == ',' || *opt == '\\')) ++ *d++ = '\\'; ++ *d++ = *opt; ++ } ++ *d = '\0'; ++ ++ return 0; ++} ++ ++int fuse_opt_add_opt(char **opts, const char *opt) ++{ ++ return add_opt_common(opts, opt, 0); ++} ++ ++int fuse_opt_add_opt_escaped(char **opts, const char *opt) ++{ ++ return add_opt_common(opts, opt, 1); ++} ++ ++static int add_opt(struct fuse_opt_context *ctx, const char *opt) ++{ ++ return add_opt_common(&ctx->opts, opt, 1); ++} ++ ++static int call_proc(struct fuse_opt_context *ctx, const char *arg, int key, ++ int iso) ++{ ++ if (key == FUSE_OPT_KEY_DISCARD) ++ return 0; ++ ++ if (key != FUSE_OPT_KEY_KEEP && ctx->proc) { ++ int res = ctx->proc(ctx->data, arg, key, &ctx->outargs); ++ if (res == -1 || !res) ++ return res; ++ } ++ if (iso) ++ return add_opt(ctx, arg); ++ else ++ return add_arg(ctx, arg); ++} ++ ++static int match_template(const char *t, const char *arg, unsigned *sepp) ++{ ++ int arglen = strlen(arg); ++ const char *sep = strchr(t, '='); ++ sep = sep ? 
sep : strchr(t, ' '); ++ if (sep && (!sep[1] || sep[1] == '%')) { ++ int tlen = sep - t; ++ if (sep[0] == '=') ++ tlen ++; ++ if (arglen >= tlen && strncmp(arg, t, tlen) == 0) { ++ *sepp = sep - t; ++ return 1; ++ } ++ } ++ if (strcmp(t, arg) == 0) { ++ *sepp = 0; ++ return 1; ++ } ++ return 0; ++} ++ ++static const struct fuse_opt *find_opt(const struct fuse_opt *opt, ++ const char *arg, unsigned *sepp) ++{ ++ for (; opt && opt->templ; opt++) ++ if (match_template(opt->templ, arg, sepp)) ++ return opt; ++ return NULL; ++} ++ ++int fuse_opt_match(const struct fuse_opt *opts, const char *opt) ++{ ++ unsigned dummy; ++ return find_opt(opts, opt, &dummy) ? 1 : 0; ++} ++ ++static int process_opt_param(void *var, const char *format, const char *param, ++ const char *arg) ++{ ++ assert(format[0] == '%'); ++ if (format[1] == 's') { ++ char **s = var; ++ char *copy = strdup(param); ++ if (!copy) ++ return alloc_failed(); ++ ++ free(*s); ++ *s = copy; ++ } else { ++ if (sscanf(param, format, var) != 1) { ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n", arg); ++ return -1; ++ } ++ } ++ return 0; ++} ++ ++static int process_opt(struct fuse_opt_context *ctx, ++ const struct fuse_opt *opt, unsigned sep, ++ const char *arg, int iso) ++{ ++ if (opt->offset == -1U) { ++ if (call_proc(ctx, arg, opt->value, iso) == -1) ++ return -1; ++ } else { ++ void *var = (char *)ctx->data + opt->offset; ++ if (sep && opt->templ[sep + 1]) { ++ const char *param = arg + sep; ++ if (opt->templ[sep] == '=') ++ param ++; ++ if (process_opt_param(var, opt->templ + sep + 1, ++ param, arg) == -1) ++ return -1; ++ } else ++ *(int *)var = opt->value; ++ } ++ return 0; ++} ++ ++static int process_opt_sep_arg(struct fuse_opt_context *ctx, ++ const struct fuse_opt *opt, unsigned sep, ++ const char *arg, int iso) ++{ ++ int res; ++ char *newarg; ++ char *param; ++ ++ if (next_arg(ctx, arg) == -1) ++ return -1; ++ ++ param = ctx->argv[ctx->argctr]; ++ newarg = malloc(sep + strlen(param) + 
1); ++ if (!newarg) ++ return alloc_failed(); ++ ++ memcpy(newarg, arg, sep); ++ strcpy(newarg + sep, param); ++ res = process_opt(ctx, opt, sep, newarg, iso); ++ free(newarg); ++ ++ return res; ++} ++ ++static int process_gopt(struct fuse_opt_context *ctx, const char *arg, int iso) ++{ ++ unsigned sep; ++ const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep); ++ if (opt) { ++ for (; opt; opt = find_opt(opt + 1, arg, &sep)) { ++ int res; ++ if (sep && opt->templ[sep] == ' ' && !arg[sep]) ++ res = process_opt_sep_arg(ctx, opt, sep, arg, ++ iso); ++ else ++ res = process_opt(ctx, opt, sep, arg, iso); ++ if (res == -1) ++ return -1; ++ } ++ return 0; ++ } else ++ return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso); ++} ++ ++static int process_real_option_group(struct fuse_opt_context *ctx, char *opts) ++{ ++ char *s = opts; ++ char *d = s; ++ int end = 0; ++ ++ while (!end) { ++ if (*s == '\0') ++ end = 1; ++ if (*s == ',' || end) { ++ int res; ++ ++ *d = '\0'; ++ res = process_gopt(ctx, opts, 1); ++ if (res == -1) ++ return -1; ++ d = opts; ++ } else { ++ if (s[0] == '\\' && s[1] != '\0') { ++ s++; ++ if (s[0] >= '0' && s[0] <= '3' && ++ s[1] >= '0' && s[1] <= '7' && ++ s[2] >= '0' && s[2] <= '7') { ++ *d++ = (s[0] - '0') * 0100 + ++ (s[1] - '0') * 0010 + ++ (s[2] - '0'); ++ s += 2; ++ } else { ++ *d++ = *s; ++ } ++ } else { ++ *d++ = *s; ++ } ++ } ++ s++; ++ } ++ ++ return 0; ++} ++ ++static int process_option_group(struct fuse_opt_context *ctx, const char *opts) ++{ ++ int res; ++ char *copy = strdup(opts); ++ ++ if (!copy) { ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; ++ } ++ res = process_real_option_group(ctx, copy); ++ free(copy); ++ return res; ++} ++ ++static int process_one(struct fuse_opt_context *ctx, const char *arg) ++{ ++ if (ctx->nonopt || arg[0] != '-') ++ return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0); ++ else if (arg[1] == 'o') { ++ if (arg[2]) ++ return process_option_group(ctx, arg + 2); ++ else { ++ if 
(next_arg(ctx, arg) == -1) ++ return -1; ++ ++ return process_option_group(ctx, ++ ctx->argv[ctx->argctr]); ++ } ++ } else if (arg[1] == '-' && !arg[2]) { ++ if (add_arg(ctx, arg) == -1) ++ return -1; ++ ctx->nonopt = ctx->outargs.argc; ++ return 0; ++ } else ++ return process_gopt(ctx, arg, 0); ++} ++ ++static int opt_parse(struct fuse_opt_context *ctx) ++{ ++ if (ctx->argc) { ++ if (add_arg(ctx, ctx->argv[0]) == -1) ++ return -1; ++ } ++ ++ for (ctx->argctr = 1; ctx->argctr < ctx->argc; ctx->argctr++) ++ if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) ++ return -1; ++ ++ if (ctx->opts) { ++ if (fuse_opt_insert_arg(&ctx->outargs, 1, "-o") == -1 || ++ fuse_opt_insert_arg(&ctx->outargs, 2, ctx->opts) == -1) ++ return -1; ++ } ++ ++ /* If option separator ("--") is the last argument, remove it */ ++ if (ctx->nonopt && ctx->nonopt == ctx->outargs.argc && ++ strcmp(ctx->outargs.argv[ctx->outargs.argc - 1], "--") == 0) { ++ free(ctx->outargs.argv[ctx->outargs.argc - 1]); ++ ctx->outargs.argv[--ctx->outargs.argc] = NULL; ++ } ++ ++ return 0; ++} ++ ++int fuse_opt_parse(struct fuse_args *args, void *data, ++ const struct fuse_opt opts[], fuse_opt_proc_t proc) ++{ ++ int res; ++ struct fuse_opt_context ctx = { ++ .data = data, ++ .opt = opts, ++ .proc = proc, ++ }; ++ ++ if (!args || !args->argv || !args->argc) ++ return 0; ++ ++ ctx.argc = args->argc; ++ ctx.argv = args->argv; ++ ++ res = opt_parse(&ctx); ++ if (res != -1) { ++ struct fuse_args tmp = *args; ++ *args = ctx.outargs; ++ ctx.outargs = tmp; ++ } ++ free(ctx.opts); ++ fuse_opt_free_args(&ctx.outargs); ++ return res; ++} +diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c +new file mode 100644 +index 0000000..4271947 +--- /dev/null ++++ b/tools/virtiofsd/fuse_signals.c +@@ -0,0 +1,91 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ Utility functions for setting signal handlers. 
++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#include "config.h" ++#include "fuse_lowlevel.h" ++#include "fuse_i.h" ++ ++#include ++#include ++#include ++#include ++ ++static struct fuse_session *fuse_instance; ++ ++static void exit_handler(int sig) ++{ ++ if (fuse_instance) { ++ fuse_session_exit(fuse_instance); ++ if(sig <= 0) { ++ fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n"); ++ abort(); ++ } ++ fuse_instance->error = sig; ++ } ++} ++ ++static void do_nothing(int sig) ++{ ++ (void) sig; ++} ++ ++static int set_one_signal_handler(int sig, void (*handler)(int), int remove) ++{ ++ struct sigaction sa; ++ struct sigaction old_sa; ++ ++ memset(&sa, 0, sizeof(struct sigaction)); ++ sa.sa_handler = remove ? SIG_DFL : handler; ++ sigemptyset(&(sa.sa_mask)); ++ sa.sa_flags = 0; ++ ++ if (sigaction(sig, NULL, &old_sa) == -1) { ++ perror("fuse: cannot get old signal handler"); ++ return -1; ++ } ++ ++ if (old_sa.sa_handler == (remove ? handler : SIG_DFL) && ++ sigaction(sig, &sa, NULL) == -1) { ++ perror("fuse: cannot set signal handler"); ++ return -1; ++ } ++ return 0; ++} ++ ++int fuse_set_signal_handlers(struct fuse_session *se) ++{ ++ /* If we used SIG_IGN instead of the do_nothing function, ++ then we would be unable to tell if we set SIG_IGN (and ++ thus should reset to SIG_DFL in fuse_remove_signal_handlers) ++ or if it was already set to SIG_IGN (and should be left ++ untouched. 
*/ ++ if (set_one_signal_handler(SIGHUP, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGINT, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGTERM, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGPIPE, do_nothing, 0) == -1) ++ return -1; ++ ++ fuse_instance = se; ++ return 0; ++} ++ ++void fuse_remove_signal_handlers(struct fuse_session *se) ++{ ++ if (fuse_instance != se) ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: fuse_remove_signal_handlers: unknown session\n"); ++ else ++ fuse_instance = NULL; ++ ++ set_one_signal_handler(SIGHUP, exit_handler, 1); ++ set_one_signal_handler(SIGINT, exit_handler, 1); ++ set_one_signal_handler(SIGTERM, exit_handler, 1); ++ set_one_signal_handler(SIGPIPE, do_nothing, 1); ++} +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +new file mode 100644 +index 0000000..64ff7ad +--- /dev/null ++++ b/tools/virtiofsd/helper.c +@@ -0,0 +1,440 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ Helper functions to create (simple) standalone programs. With the ++ aid of these functions it should be possible to create full FUSE ++ file system by implementing nothing but the request handlers. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. 
++*/ ++ ++#include "config.h" ++#include "fuse_i.h" ++#include "fuse_misc.h" ++#include "fuse_opt.h" ++#include "fuse_lowlevel.h" ++#include "mount_util.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define FUSE_HELPER_OPT(t, p) \ ++ { t, offsetof(struct fuse_cmdline_opts, p), 1 } ++ ++static const struct fuse_opt fuse_helper_opts[] = { ++ FUSE_HELPER_OPT("-h", show_help), ++ FUSE_HELPER_OPT("--help", show_help), ++ FUSE_HELPER_OPT("-V", show_version), ++ FUSE_HELPER_OPT("--version", show_version), ++ FUSE_HELPER_OPT("-d", debug), ++ FUSE_HELPER_OPT("debug", debug), ++ FUSE_HELPER_OPT("-d", foreground), ++ FUSE_HELPER_OPT("debug", foreground), ++ FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), ++ FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), ++ FUSE_HELPER_OPT("-f", foreground), ++ FUSE_HELPER_OPT("-s", singlethread), ++ FUSE_HELPER_OPT("fsname=", nodefault_subtype), ++ FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), ++#ifndef __FreeBSD__ ++ FUSE_HELPER_OPT("subtype=", nodefault_subtype), ++ FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), ++#endif ++ FUSE_HELPER_OPT("clone_fd", clone_fd), ++ FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), ++ FUSE_OPT_END ++}; ++ ++struct fuse_conn_info_opts { ++ int atomic_o_trunc; ++ int no_remote_posix_lock; ++ int no_remote_flock; ++ int splice_write; ++ int splice_move; ++ int splice_read; ++ int no_splice_write; ++ int no_splice_move; ++ int no_splice_read; ++ int auto_inval_data; ++ int no_auto_inval_data; ++ int no_readdirplus; ++ int no_readdirplus_auto; ++ int async_dio; ++ int no_async_dio; ++ int writeback_cache; ++ int no_writeback_cache; ++ int async_read; ++ int sync_read; ++ unsigned max_write; ++ unsigned max_readahead; ++ unsigned max_background; ++ unsigned congestion_threshold; ++ unsigned time_gran; ++ int set_max_write; ++ int set_max_readahead; ++ int set_max_background; ++ int set_congestion_threshold; ++ int set_time_gran; ++}; ++ ++#define CONN_OPTION(t, p, v) \ ++ 
{ t, offsetof(struct fuse_conn_info_opts, p), v } ++static const struct fuse_opt conn_info_opt_spec[] = { ++ CONN_OPTION("max_write=%u", max_write, 0), ++ CONN_OPTION("max_write=", set_max_write, 1), ++ CONN_OPTION("max_readahead=%u", max_readahead, 0), ++ CONN_OPTION("max_readahead=", set_max_readahead, 1), ++ CONN_OPTION("max_background=%u", max_background, 0), ++ CONN_OPTION("max_background=", set_max_background, 1), ++ CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0), ++ CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1), ++ CONN_OPTION("sync_read", sync_read, 1), ++ CONN_OPTION("async_read", async_read, 1), ++ CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1), ++ CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1), ++ CONN_OPTION("no_remote_lock", no_remote_flock, 1), ++ CONN_OPTION("no_remote_flock", no_remote_flock, 1), ++ CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1), ++ CONN_OPTION("splice_write", splice_write, 1), ++ CONN_OPTION("no_splice_write", no_splice_write, 1), ++ CONN_OPTION("splice_move", splice_move, 1), ++ CONN_OPTION("no_splice_move", no_splice_move, 1), ++ CONN_OPTION("splice_read", splice_read, 1), ++ CONN_OPTION("no_splice_read", no_splice_read, 1), ++ CONN_OPTION("auto_inval_data", auto_inval_data, 1), ++ CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1), ++ CONN_OPTION("readdirplus=no", no_readdirplus, 1), ++ CONN_OPTION("readdirplus=yes", no_readdirplus, 0), ++ CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1), ++ CONN_OPTION("readdirplus=auto", no_readdirplus, 0), ++ CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0), ++ CONN_OPTION("async_dio", async_dio, 1), ++ CONN_OPTION("no_async_dio", no_async_dio, 1), ++ CONN_OPTION("writeback_cache", writeback_cache, 1), ++ CONN_OPTION("no_writeback_cache", no_writeback_cache, 1), ++ CONN_OPTION("time_gran=%u", time_gran, 0), ++ CONN_OPTION("time_gran=", set_time_gran, 1), ++ FUSE_OPT_END ++}; ++ ++ ++void fuse_cmdline_help(void) 
++{ ++ printf(" -h --help print help\n" ++ " -V --version print version\n" ++ " -d -o debug enable debug output (implies -f)\n" ++ " -f foreground operation\n" ++ " -s disable multi-threaded operation\n" ++ " -o clone_fd use separate fuse device fd for each thread\n" ++ " (may improve performance)\n" ++ " -o max_idle_threads the maximum number of idle worker threads\n" ++ " allowed (default: 10)\n"); ++} ++ ++static int fuse_helper_opt_proc(void *data, const char *arg, int key, ++ struct fuse_args *outargs) ++{ ++ (void) outargs; ++ struct fuse_cmdline_opts *opts = data; ++ ++ switch (key) { ++ case FUSE_OPT_KEY_NONOPT: ++ if (!opts->mountpoint) { ++ if (fuse_mnt_parse_fuse_fd(arg) != -1) { ++ return fuse_opt_add_opt(&opts->mountpoint, arg); ++ } ++ ++ char mountpoint[PATH_MAX] = ""; ++ if (realpath(arg, mountpoint) == NULL) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: bad mount point `%s': %s\n", ++ arg, strerror(errno)); ++ return -1; ++ } ++ return fuse_opt_add_opt(&opts->mountpoint, mountpoint); ++ } else { ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); ++ return -1; ++ } ++ ++ default: ++ /* Pass through unknown options */ ++ return 1; ++ } ++} ++ ++/* Under FreeBSD, there is no subtype option so this ++ function actually sets the fsname */ ++static int add_default_subtype(const char *progname, struct fuse_args *args) ++{ ++ int res; ++ char *subtype_opt; ++ ++ const char *basename = strrchr(progname, '/'); ++ if (basename == NULL) ++ basename = progname; ++ else if (basename[1] != '\0') ++ basename++; ++ ++ subtype_opt = (char *) malloc(strlen(basename) + 64); ++ if (subtype_opt == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; ++ } ++#ifdef __FreeBSD__ ++ sprintf(subtype_opt, "-ofsname=%s", basename); ++#else ++ sprintf(subtype_opt, "-osubtype=%s", basename); ++#endif ++ res = fuse_opt_add_arg(args, subtype_opt); ++ free(subtype_opt); ++ return res; ++} ++ ++int fuse_parse_cmdline(struct fuse_args *args, ++ 
struct fuse_cmdline_opts *opts) ++{ ++ memset(opts, 0, sizeof(struct fuse_cmdline_opts)); ++ ++ opts->max_idle_threads = 10; ++ ++ if (fuse_opt_parse(args, opts, fuse_helper_opts, ++ fuse_helper_opt_proc) == -1) ++ return -1; ++ ++ /* *Linux*: if neither -o subtype nor -o fsname are specified, ++ set subtype to program's basename. ++ *FreeBSD*: if fsname is not specified, set to program's ++ basename. */ ++ if (!opts->nodefault_subtype) ++ if (add_default_subtype(args->argv[0], args) == -1) ++ return -1; ++ ++ return 0; ++} ++ ++ ++int fuse_daemonize(int foreground) ++{ ++ if (!foreground) { ++ int nullfd; ++ int waiter[2]; ++ char completed; ++ ++ if (pipe(waiter)) { ++ perror("fuse_daemonize: pipe"); ++ return -1; ++ } ++ ++ /* ++ * demonize current process by forking it and killing the ++ * parent. This makes current process as a child of 'init'. ++ */ ++ switch(fork()) { ++ case -1: ++ perror("fuse_daemonize: fork"); ++ return -1; ++ case 0: ++ break; ++ default: ++ (void) read(waiter[0], &completed, sizeof(completed)); ++ _exit(0); ++ } ++ ++ if (setsid() == -1) { ++ perror("fuse_daemonize: setsid"); ++ return -1; ++ } ++ ++ (void) chdir("/"); ++ ++ nullfd = open("/dev/null", O_RDWR, 0); ++ if (nullfd != -1) { ++ (void) dup2(nullfd, 0); ++ (void) dup2(nullfd, 1); ++ (void) dup2(nullfd, 2); ++ if (nullfd > 2) ++ close(nullfd); ++ } ++ ++ /* Propagate completion of daemon initialization */ ++ completed = 1; ++ (void) write(waiter[1], &completed, sizeof(completed)); ++ close(waiter[0]); ++ close(waiter[1]); ++ } else { ++ (void) chdir("/"); ++ } ++ return 0; ++} ++ ++int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op, ++ size_t op_size, void *user_data) ++{ ++ struct fuse_args args = FUSE_ARGS_INIT(argc, argv); ++ struct fuse *fuse; ++ struct fuse_cmdline_opts opts; ++ int res; ++ ++ if (fuse_parse_cmdline(&args, &opts) != 0) ++ return 1; ++ ++ if (opts.show_version) { ++ printf("FUSE library version %s\n", PACKAGE_VERSION); ++ 
fuse_lowlevel_version(); ++ res = 0; ++ goto out1; ++ } ++ ++ if (opts.show_help) { ++ if(args.argv[0][0] != '\0') ++ printf("usage: %s [options] \n\n", ++ args.argv[0]); ++ printf("FUSE options:\n"); ++ fuse_cmdline_help(); ++ fuse_lib_help(&args); ++ res = 0; ++ goto out1; ++ } ++ ++ if (!opts.show_help && ++ !opts.mountpoint) { ++ fuse_log(FUSE_LOG_ERR, "error: no mountpoint specified\n"); ++ res = 2; ++ goto out1; ++ } ++ ++ ++ fuse = fuse_new_31(&args, op, op_size, user_data); ++ if (fuse == NULL) { ++ res = 3; ++ goto out1; ++ } ++ ++ if (fuse_mount(fuse,opts.mountpoint) != 0) { ++ res = 4; ++ goto out2; ++ } ++ ++ if (fuse_daemonize(opts.foreground) != 0) { ++ res = 5; ++ goto out3; ++ } ++ ++ struct fuse_session *se = fuse_get_session(fuse); ++ if (fuse_set_signal_handlers(se) != 0) { ++ res = 6; ++ goto out3; ++ } ++ ++ if (opts.singlethread) ++ res = fuse_loop(fuse); ++ else { ++ struct fuse_loop_config loop_config; ++ loop_config.clone_fd = opts.clone_fd; ++ loop_config.max_idle_threads = opts.max_idle_threads; ++ res = fuse_loop_mt_32(fuse, &loop_config); ++ } ++ if (res) ++ res = 7; ++ ++ fuse_remove_signal_handlers(se); ++out3: ++ fuse_unmount(fuse); ++out2: ++ fuse_destroy(fuse); ++out1: ++ free(opts.mountpoint); ++ fuse_opt_free_args(&args); ++ return res; ++} ++ ++ ++void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, ++ struct fuse_conn_info *conn) ++{ ++ if(opts->set_max_write) ++ conn->max_write = opts->max_write; ++ if(opts->set_max_background) ++ conn->max_background = opts->max_background; ++ if(opts->set_congestion_threshold) ++ conn->congestion_threshold = opts->congestion_threshold; ++ if(opts->set_time_gran) ++ conn->time_gran = opts->time_gran; ++ if(opts->set_max_readahead) ++ conn->max_readahead = opts->max_readahead; ++ ++#define LL_ENABLE(cond,cap) \ ++ if (cond) conn->want |= (cap) ++#define LL_DISABLE(cond,cap) \ ++ if (cond) conn->want &= ~(cap) ++ ++ LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ); ++ 
LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ); ++ ++ LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE); ++ LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE); ++ ++ LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE); ++ LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE); ++ ++ LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); ++ LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); ++ ++ LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS); ++ LL_DISABLE(opts->no_readdirplus_auto, FUSE_CAP_READDIRPLUS_AUTO); ++ ++ LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO); ++ LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO); ++ ++ LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE); ++ LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE); ++ ++ LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ); ++ LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ); ++ ++ LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS); ++ LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS); ++} ++ ++struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args) ++{ ++ struct fuse_conn_info_opts *opts; ++ ++ opts = calloc(1, sizeof(struct fuse_conn_info_opts)); ++ if(opts == NULL) { ++ fuse_log(FUSE_LOG_ERR, "calloc failed\n"); ++ return NULL; ++ } ++ if(fuse_opt_parse(args, opts, conn_info_opt_spec, NULL) == -1) { ++ free(opts); ++ return NULL; ++ } ++ return opts; ++} ++ ++int fuse_open_channel(const char *mountpoint, const char* options) ++{ ++ struct mount_opts *opts = NULL; ++ int fd = -1; ++ const char *argv[] = { "", "-o", options }; ++ int argc = sizeof(argv) / sizeof(argv[0]); ++ struct fuse_args args = FUSE_ARGS_INIT(argc, (char**) argv); ++ ++ opts = parse_mount_opts(&args); ++ if (opts == NULL) ++ return -1; ++ ++ fd = fuse_kern_mount(mountpoint, opts); ++ destroy_mount_opts(opts); ++ ++ return fd; ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-fuse_lowlevel.c.patch 
b/kvm-virtiofsd-Add-fuse_lowlevel.c.patch new file mode 100755 index 0000000..1318fef --- /dev/null +++ b/kvm-virtiofsd-Add-fuse_lowlevel.c.patch @@ -0,0 +1,3172 @@ +From f6c6830f772e8060255323d2a458cd0e774d9654 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:42 +0100 +Subject: [PATCH 011/116] virtiofsd: Add fuse_lowlevel.c +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-8-dgilbert@redhat.com> +Patchwork-id: 93456 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 007/112] virtiofsd: Add fuse_lowlevel.c +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +fuse_lowlevel is one of the largest files from the library +and does most of the work. Add it separately to keep the diff +sizes small. +Again this is from upstream fuse-3.8.0 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 2de121f01e37e2fe98a4362f4abf7c0848697f76) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 3129 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 3129 insertions(+) + create mode 100644 tools/virtiofsd/fuse_lowlevel.c + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +new file mode 100644 +index 0000000..f2d7038 +--- /dev/null ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -0,0 +1,3129 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ Implementation of (most of) the low-level FUSE API. The session loop ++ functions are implemented in separate files. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. 
++ See the file COPYING.LIB ++*/ ++ ++#define _GNU_SOURCE ++ ++#include "config.h" ++#include "fuse_i.h" ++#include "fuse_kernel.h" ++#include "fuse_opt.h" ++#include "fuse_misc.h" ++#include "mount_util.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifndef F_LINUX_SPECIFIC_BASE ++#define F_LINUX_SPECIFIC_BASE 1024 ++#endif ++#ifndef F_SETPIPE_SZ ++#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) ++#endif ++ ++ ++#define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) ++#define OFFSET_MAX 0x7fffffffffffffffLL ++ ++#define container_of(ptr, type, member) ({ \ ++ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ ++ (type *)( (char *)__mptr - offsetof(type,member) );}) ++ ++struct fuse_pollhandle { ++ uint64_t kh; ++ struct fuse_session *se; ++}; ++ ++static size_t pagesize; ++ ++static __attribute__((constructor)) void fuse_ll_init_pagesize(void) ++{ ++ pagesize = getpagesize(); ++} ++ ++static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr) ++{ ++ attr->ino = stbuf->st_ino; ++ attr->mode = stbuf->st_mode; ++ attr->nlink = stbuf->st_nlink; ++ attr->uid = stbuf->st_uid; ++ attr->gid = stbuf->st_gid; ++ attr->rdev = stbuf->st_rdev; ++ attr->size = stbuf->st_size; ++ attr->blksize = stbuf->st_blksize; ++ attr->blocks = stbuf->st_blocks; ++ attr->atime = stbuf->st_atime; ++ attr->mtime = stbuf->st_mtime; ++ attr->ctime = stbuf->st_ctime; ++ attr->atimensec = ST_ATIM_NSEC(stbuf); ++ attr->mtimensec = ST_MTIM_NSEC(stbuf); ++ attr->ctimensec = ST_CTIM_NSEC(stbuf); ++} ++ ++static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf) ++{ ++ stbuf->st_mode = attr->mode; ++ stbuf->st_uid = attr->uid; ++ stbuf->st_gid = attr->gid; ++ stbuf->st_size = attr->size; ++ stbuf->st_atime = attr->atime; ++ stbuf->st_mtime = attr->mtime; ++ stbuf->st_ctime = attr->ctime; ++ ST_ATIM_NSEC_SET(stbuf, attr->atimensec); ++ ST_MTIM_NSEC_SET(stbuf, attr->mtimensec); ++ 
ST_CTIM_NSEC_SET(stbuf, attr->ctimensec); ++} ++ ++static size_t iov_length(const struct iovec *iov, size_t count) ++{ ++ size_t seg; ++ size_t ret = 0; ++ ++ for (seg = 0; seg < count; seg++) ++ ret += iov[seg].iov_len; ++ return ret; ++} ++ ++static void list_init_req(struct fuse_req *req) ++{ ++ req->next = req; ++ req->prev = req; ++} ++ ++static void list_del_req(struct fuse_req *req) ++{ ++ struct fuse_req *prev = req->prev; ++ struct fuse_req *next = req->next; ++ prev->next = next; ++ next->prev = prev; ++} ++ ++static void list_add_req(struct fuse_req *req, struct fuse_req *next) ++{ ++ struct fuse_req *prev = next->prev; ++ req->next = next; ++ req->prev = prev; ++ prev->next = req; ++ next->prev = req; ++} ++ ++static void destroy_req(fuse_req_t req) ++{ ++ pthread_mutex_destroy(&req->lock); ++ free(req); ++} ++ ++void fuse_free_req(fuse_req_t req) ++{ ++ int ctr; ++ struct fuse_session *se = req->se; ++ ++ pthread_mutex_lock(&se->lock); ++ req->u.ni.func = NULL; ++ req->u.ni.data = NULL; ++ list_del_req(req); ++ ctr = --req->ctr; ++ fuse_chan_put(req->ch); ++ req->ch = NULL; ++ pthread_mutex_unlock(&se->lock); ++ if (!ctr) ++ destroy_req(req); ++} ++ ++static struct fuse_req *fuse_ll_alloc_req(struct fuse_session *se) ++{ ++ struct fuse_req *req; ++ ++ req = (struct fuse_req *) calloc(1, sizeof(struct fuse_req)); ++ if (req == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n"); ++ } else { ++ req->se = se; ++ req->ctr = 1; ++ list_init_req(req); ++ fuse_mutex_init(&req->lock); ++ } ++ ++ return req; ++} ++ ++/* Send data. 
If *ch* is NULL, send via session master fd */ ++static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count) ++{ ++ struct fuse_out_header *out = iov[0].iov_base; ++ ++ out->len = iov_length(iov, count); ++ if (se->debug) { ++ if (out->unique == 0) { ++ fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", ++ out->error, out->len); ++ } else if (out->error) { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, error: %i (%s), outsize: %i\n", ++ (unsigned long long) out->unique, out->error, ++ strerror(-out->error), out->len); ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, success, outsize: %i\n", ++ (unsigned long long) out->unique, out->len); ++ } ++ } ++ ++ ssize_t res = writev(ch ? ch->fd : se->fd, ++ iov, count); ++ int err = errno; ++ ++ if (res == -1) { ++ assert(se != NULL); ++ ++ /* ENOENT means the operation was interrupted */ ++ if (!fuse_session_exited(se) && err != ENOENT) ++ perror("fuse: writing device"); ++ return -err; ++ } ++ ++ return 0; ++} ++ ++ ++int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, ++ int count) ++{ ++ struct fuse_out_header out; ++ ++ if (error <= -1000 || error > 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); ++ error = -ERANGE; ++ } ++ ++ out.unique = req->unique; ++ out.error = error; ++ ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); ++ ++ return fuse_send_msg(req->se, req->ch, iov, count); ++} ++ ++static int send_reply_iov(fuse_req_t req, int error, struct iovec *iov, ++ int count) ++{ ++ int res; ++ ++ res = fuse_send_reply_iov_nofree(req, error, iov, count); ++ fuse_free_req(req); ++ return res; ++} ++ ++static int send_reply(fuse_req_t req, int error, const void *arg, ++ size_t argsize) ++{ ++ struct iovec iov[2]; ++ int count = 1; ++ if (argsize) { ++ iov[1].iov_base = (void *) arg; ++ iov[1].iov_len = argsize; ++ count++; ++ } ++ return send_reply_iov(req, error, iov, count); ++} ++ 
++int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count) /* Reply with the caller's iovec array: it is copied into a heap array behind one spare leading slot (index 0 is not filled in here; presumably send_reply_iov() uses it for the reply header - confirm) and passed to send_reply_iov() with error 0. Replies ENOMEM if the allocation fails. */ ++{ ++ int res; ++ struct iovec *padded_iov; ++ ++ padded_iov = malloc((count + 1) * sizeof(struct iovec)); ++ if (padded_iov == NULL) ++ return fuse_reply_err(req, ENOMEM); ++ ++ memcpy(padded_iov + 1, iov, count * sizeof(struct iovec)); ++ count++; ++ ++ res = send_reply_iov(req, 0, padded_iov, count); ++ free(padded_iov); ++ ++ return res; ++} ++ ++ ++/* `buf` is allowed to be empty so that the proper size may be ++ allocated by the caller */ ++size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, ++ const char *name, const struct stat *stbuf, off_t off) /* Returns the padded entry size in every case; nothing is written when buf is NULL or the padded entry does not fit in bufsize. req is unused. */ ++{ ++ (void)req; ++ size_t namelen; ++ size_t entlen; ++ size_t entlen_padded; ++ struct fuse_dirent *dirent; ++ ++ namelen = strlen(name); ++ entlen = FUSE_NAME_OFFSET + namelen; ++ entlen_padded = FUSE_DIRENT_ALIGN(entlen); ++ ++ if ((buf == NULL) || (entlen_padded > bufsize)) ++ return entlen_padded; ++ ++ dirent = (struct fuse_dirent*) buf; ++ dirent->ino = stbuf->st_ino; ++ dirent->off = off; ++ dirent->namelen = namelen; ++ dirent->type = (stbuf->st_mode & S_IFMT) >> 12; ++ memcpy(dirent->name, name, namelen); ++ memset(dirent->name + namelen, 0, entlen_padded - entlen); /* zero the alignment padding after the name; the name is stored without a terminating NUL */ ++ ++ return entlen_padded; ++} ++ ++static void convert_statfs(const struct statvfs *stbuf, ++ struct fuse_kstatfs *kstatfs) /* Field-by-field copy from struct statvfs into struct fuse_kstatfs; fields of kstatfs not listed below are left untouched. */ ++{ ++ kstatfs->bsize = stbuf->f_bsize; ++ kstatfs->frsize = stbuf->f_frsize; ++ kstatfs->blocks = stbuf->f_blocks; ++ kstatfs->bfree = stbuf->f_bfree; ++ kstatfs->bavail = stbuf->f_bavail; ++ kstatfs->files = stbuf->f_files; ++ kstatfs->ffree = stbuf->f_ffree; ++ kstatfs->namelen = stbuf->f_namemax; ++} ++ ++static int send_reply_ok(fuse_req_t req, const void *arg, size_t argsize) /* send_reply() with error code 0 and the given payload. */ ++{ ++ return send_reply(req, 0, arg, argsize); ++} ++ ++int fuse_reply_err(fuse_req_t req, int err) /* send_reply() with the negated err and no payload; callers pass a positive errno value (err 0 is also used in this file). */ ++{ ++ return send_reply(req, -err, NULL, 0); ++} ++ ++void fuse_reply_none(fuse_req_t req) /* Sends nothing; only releases the request via fuse_free_req(). */ ++{ ++ fuse_free_req(req); ++} ++ ++static unsigned long 
calc_timeout_sec(double t) /* Whole seconds of t, clamped to the range [0, ULONG_MAX]. */ ++{ ++ if (t > (double) ULONG_MAX) ++ return ULONG_MAX; ++ else if (t < 0.0) ++ return 0; ++ else ++ return (unsigned long) t; ++} ++ ++static unsigned int calc_timeout_nsec(double t) /* Fractional part of t (t minus calc_timeout_sec(t)) in nanoseconds, clamped to [0, 999999999]. */ ++{ ++ double f = t - (double) calc_timeout_sec(t); ++ if (f < 0.0) ++ return 0; ++ else if (f >= 0.999999999) ++ return 999999999; ++ else ++ return (unsigned int) (f * 1.0e9); ++} ++ ++static void fill_entry(struct fuse_entry_out *arg, ++ const struct fuse_entry_param *e) /* Sets nodeid, generation, both timeouts and (via convert_stat()) attr from e; no other field of arg is written here. */ ++{ ++ arg->nodeid = e->ino; ++ arg->generation = e->generation; ++ arg->entry_valid = calc_timeout_sec(e->entry_timeout); ++ arg->entry_valid_nsec = calc_timeout_nsec(e->entry_timeout); ++ arg->attr_valid = calc_timeout_sec(e->attr_timeout); ++ arg->attr_valid_nsec = calc_timeout_nsec(e->attr_timeout); ++ convert_stat(&e->attr, &arg->attr); ++} ++ ++/* `buf` is allowed to be empty so that the proper size may be ++ allocated by the caller */ ++size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, ++ const char *name, ++ const struct fuse_entry_param *e, off_t off) /* Returns the padded entry size (based on FUSE_NAME_OFFSET_DIRENTPLUS) in every case; nothing is written when buf is NULL or the entry does not fit. Otherwise entry_out is zeroed and filled from e, followed by the dirent. req is unused. */ ++{ ++ (void)req; ++ size_t namelen; ++ size_t entlen; ++ size_t entlen_padded; ++ ++ namelen = strlen(name); ++ entlen = FUSE_NAME_OFFSET_DIRENTPLUS + namelen; ++ entlen_padded = FUSE_DIRENT_ALIGN(entlen); ++ if ((buf == NULL) || (entlen_padded > bufsize)) ++ return entlen_padded; ++ ++ struct fuse_direntplus *dp = (struct fuse_direntplus *) buf; ++ memset(&dp->entry_out, 0, sizeof(dp->entry_out)); ++ fill_entry(&dp->entry_out, e); ++ ++ struct fuse_dirent *dirent = &dp->dirent; ++ dirent->ino = e->attr.st_ino; ++ dirent->off = off; ++ dirent->namelen = namelen; ++ dirent->type = (e->attr.st_mode & S_IFMT) >> 12; ++ memcpy(dirent->name, name, namelen); ++ memset(dirent->name + namelen, 0, entlen_padded - entlen); ++ ++ return entlen_padded; ++} ++ ++static void fill_open(struct fuse_open_out *arg, ++ const struct fuse_file_info *f) /* Copies fh and ORs FOPEN_* bits into arg->open_flags; bits are never cleared here, so arg must already be zeroed (the callers visible here memset it first). */ ++{ ++ arg->fh = f->fh; ++ if (f->direct_io) ++ arg->open_flags |= 
FOPEN_DIRECT_IO; ++ if (f->keep_cache) ++ arg->open_flags |= FOPEN_KEEP_CACHE; ++ if (f->cache_readdir) ++ arg->open_flags |= FOPEN_CACHE_DIR; ++ if (f->nonseekable) ++ arg->open_flags |= FOPEN_NONSEEKABLE; ++} ++ ++int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e) /* Replies with a fuse_entry_out built from e; only FUSE_COMPAT_ENTRY_OUT_SIZE bytes are sent when proto_minor < 9. */ ++{ ++ struct fuse_entry_out arg; ++ size_t size = req->se->conn.proto_minor < 9 ? ++ FUSE_COMPAT_ENTRY_OUT_SIZE : sizeof(arg); ++ ++ /* before ABI 7.4 e->ino == 0 was invalid, only ENOENT meant ++ negative entry */ ++ if (!e->ino && req->se->conn.proto_minor < 4) ++ return fuse_reply_err(req, ENOENT); ++ ++ memset(&arg, 0, sizeof(arg)); ++ fill_entry(&arg, e); ++ return send_reply_ok(req, &arg, size); ++} ++ ++int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, ++ const struct fuse_file_info *f) /* Packs fuse_entry_out followed by fuse_open_out into one zeroed buffer; open_out starts at the protocol-dependent entry size, so the compat layout (proto_minor < 9) is shorter. */ ++{ ++ char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; ++ size_t entrysize = req->se->conn.proto_minor < 9 ? ++ FUSE_COMPAT_ENTRY_OUT_SIZE : sizeof(struct fuse_entry_out); ++ struct fuse_entry_out *earg = (struct fuse_entry_out *) buf; ++ struct fuse_open_out *oarg = (struct fuse_open_out *) (buf + entrysize); ++ ++ memset(buf, 0, sizeof(buf)); ++ fill_entry(earg, e); ++ fill_open(oarg, f); ++ return send_reply_ok(req, buf, ++ entrysize + sizeof(struct fuse_open_out)); ++} ++ ++int fuse_reply_attr(fuse_req_t req, const struct stat *attr, ++ double attr_timeout) /* Replies with a fuse_attr_out holding attr and its validity timeout; only FUSE_COMPAT_ATTR_OUT_SIZE bytes are sent when proto_minor < 9. */ ++{ ++ struct fuse_attr_out arg; ++ size_t size = req->se->conn.proto_minor < 9 ? 
++ FUSE_COMPAT_ATTR_OUT_SIZE : sizeof(arg); ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.attr_valid = calc_timeout_sec(attr_timeout); ++ arg.attr_valid_nsec = calc_timeout_nsec(attr_timeout); ++ convert_stat(attr, &arg.attr); ++ ++ return send_reply_ok(req, &arg, size); ++} ++ ++int fuse_reply_readlink(fuse_req_t req, const char *linkname) /* Payload is strlen(linkname) bytes, so the terminating NUL is not sent. */ ++{ ++ return send_reply_ok(req, linkname, strlen(linkname)); ++} ++ ++int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *f) /* Replies with a zeroed fuse_open_out filled in by fill_open(). */ ++{ ++ struct fuse_open_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ fill_open(&arg, f); ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_write(fuse_req_t req, size_t count) /* Replies with a fuse_write_out whose size field is count. */ ++{ ++ struct fuse_write_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.size = count; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size) /* Replies with size bytes from buf and error code 0. */ ++{ ++ return send_reply_ok(req, buf, size); ++} ++ ++static int fuse_send_data_iov_fallback(struct fuse_session *se, ++ struct fuse_chan *ch, ++ struct iovec *iov, int iov_count, ++ struct fuse_bufvec *buf, ++ size_t len) /* Non-splice send path: the data is appended as iov[iov_count] (so the caller's array needs that spare slot) and sent with fuse_send_msg(). On failure returns the posix_memalign() error number or the negated fuse_buf_copy() error; otherwise returns the fuse_send_msg() result. */ ++{ ++ struct fuse_bufvec mem_buf = FUSE_BUFVEC_INIT(len); ++ void *mbuf; ++ int res; ++ ++ /* Optimize common case */ ++ if (buf->count == 1 && buf->idx == 0 && buf->off == 0 && ++ !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { ++ /* FIXME: also avoid memory copy if there are multiple buffers ++ but none of them contain an fd */ ++ ++ iov[iov_count].iov_base = buf->buf[0].mem; ++ iov[iov_count].iov_len = len; ++ iov_count++; ++ return fuse_send_msg(se, ch, iov, iov_count); ++ } ++ ++ res = posix_memalign(&mbuf, pagesize, len); ++ if (res != 0) ++ return res; ++ ++ mem_buf.buf[0].mem = mbuf; ++ res = fuse_buf_copy(&mem_buf, buf, 0); ++ if (res < 0) { ++ free(mbuf); ++ return -res; ++ } ++ len = res; /* send only what fuse_buf_copy() actually copied */ ++ ++ iov[iov_count].iov_base = mbuf; ++ iov[iov_count].iov_len = len; ++ iov_count++; ++ res = fuse_send_msg(se, ch, iov, iov_count); ++ free(mbuf); ++ ++ return res; ++} ++ 
++struct fuse_ll_pipe { ++ size_t size; ++ int can_grow; ++ int pipe[2]; ++}; ++ ++static void fuse_ll_pipe_free(struct fuse_ll_pipe *llp) ++{ ++ close(llp->pipe[0]); ++ close(llp->pipe[1]); ++ free(llp); ++} ++ ++#ifdef HAVE_SPLICE ++#if !defined(HAVE_PIPE2) || !defined(O_CLOEXEC) ++static int fuse_pipe(int fds[2]) ++{ ++ int rv = pipe(fds); ++ ++ if (rv == -1) ++ return rv; ++ ++ if (fcntl(fds[0], F_SETFL, O_NONBLOCK) == -1 || ++ fcntl(fds[1], F_SETFL, O_NONBLOCK) == -1 || ++ fcntl(fds[0], F_SETFD, FD_CLOEXEC) == -1 || ++ fcntl(fds[1], F_SETFD, FD_CLOEXEC) == -1) { ++ close(fds[0]); ++ close(fds[1]); ++ rv = -1; ++ } ++ return rv; ++} ++#else ++static int fuse_pipe(int fds[2]) ++{ ++ return pipe2(fds, O_CLOEXEC | O_NONBLOCK); ++} ++#endif ++ ++static struct fuse_ll_pipe *fuse_ll_get_pipe(struct fuse_session *se) ++{ ++ struct fuse_ll_pipe *llp = pthread_getspecific(se->pipe_key); ++ if (llp == NULL) { ++ int res; ++ ++ llp = malloc(sizeof(struct fuse_ll_pipe)); ++ if (llp == NULL) ++ return NULL; ++ ++ res = fuse_pipe(llp->pipe); ++ if (res == -1) { ++ free(llp); ++ return NULL; ++ } ++ ++ /* ++ *the default size is 16 pages on linux ++ */ ++ llp->size = pagesize * 16; ++ llp->can_grow = 1; ++ ++ pthread_setspecific(se->pipe_key, llp); ++ } ++ ++ return llp; ++} ++#endif ++ ++static void fuse_ll_clear_pipe(struct fuse_session *se) ++{ ++ struct fuse_ll_pipe *llp = pthread_getspecific(se->pipe_key); ++ if (llp) { ++ pthread_setspecific(se->pipe_key, NULL); ++ fuse_ll_pipe_free(llp); ++ } ++} ++ ++#if defined(HAVE_SPLICE) && defined(HAVE_VMSPLICE) ++static int read_back(int fd, char *buf, size_t len) ++{ ++ int res; ++ ++ res = read(fd, buf, len); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "fuse: internal error: failed to read back from pipe: %s\n", strerror(errno)); ++ return -EIO; ++ } ++ if (res != len) { ++ fuse_log(FUSE_LOG_ERR, "fuse: internal error: short read back from pipe: %i from %zi\n", res, len); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static int 
grow_pipe_to_max(int pipefd) ++{ ++ int max; ++ int res; ++ int maxfd; ++ char buf[32]; ++ ++ maxfd = open("/proc/sys/fs/pipe-max-size", O_RDONLY); ++ if (maxfd < 0) ++ return -errno; ++ ++ res = read(maxfd, buf, sizeof(buf) - 1); ++ if (res < 0) { ++ int saved_errno; ++ ++ saved_errno = errno; ++ close(maxfd); ++ return -saved_errno; ++ } ++ close(maxfd); ++ buf[res] = '\0'; ++ ++ max = atoi(buf); ++ res = fcntl(pipefd, F_SETPIPE_SZ, max); ++ if (res < 0) ++ return -errno; ++ return max; ++} ++ ++static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int iov_count, ++ struct fuse_bufvec *buf, unsigned int flags) ++{ ++ int res; ++ size_t len = fuse_buf_size(buf); ++ struct fuse_out_header *out = iov[0].iov_base; ++ struct fuse_ll_pipe *llp; ++ int splice_flags; ++ size_t pipesize; ++ size_t total_fd_size; ++ size_t idx; ++ size_t headerlen; ++ struct fuse_bufvec pipe_buf = FUSE_BUFVEC_INIT(len); ++ ++ if (se->broken_splice_nonblock) ++ goto fallback; ++ ++ if (flags & FUSE_BUF_NO_SPLICE) ++ goto fallback; ++ ++ total_fd_size = 0; ++ for (idx = buf->idx; idx < buf->count; idx++) { ++ if (buf->buf[idx].flags & FUSE_BUF_IS_FD) { ++ total_fd_size = buf->buf[idx].size; ++ if (idx == buf->idx) ++ total_fd_size -= buf->off; ++ } ++ } ++ if (total_fd_size < 2 * pagesize) ++ goto fallback; ++ ++ if (se->conn.proto_minor < 14 || ++ !(se->conn.want & FUSE_CAP_SPLICE_WRITE)) ++ goto fallback; ++ ++ llp = fuse_ll_get_pipe(se); ++ if (llp == NULL) ++ goto fallback; ++ ++ ++ headerlen = iov_length(iov, iov_count); ++ ++ out->len = headerlen + len; ++ ++ /* ++ * Heuristic for the required pipe size, does not work if the ++ * source contains less than page size fragments ++ */ ++ pipesize = pagesize * (iov_count + buf->count + 1) + out->len; ++ ++ if (llp->size < pipesize) { ++ if (llp->can_grow) { ++ res = fcntl(llp->pipe[0], F_SETPIPE_SZ, pipesize); ++ if (res == -1) { ++ res = grow_pipe_to_max(llp->pipe[0]); ++ if (res > 0) ++ 
llp->size = res; ++ llp->can_grow = 0; ++ goto fallback; ++ } ++ llp->size = res; ++ } ++ if (llp->size < pipesize) ++ goto fallback; ++ } ++ ++ ++ res = vmsplice(llp->pipe[1], iov, iov_count, SPLICE_F_NONBLOCK); ++ if (res == -1) ++ goto fallback; ++ ++ if (res != headerlen) { ++ res = -EIO; ++ fuse_log(FUSE_LOG_ERR, "fuse: short vmsplice to pipe: %u/%zu\n", res, ++ headerlen); ++ goto clear_pipe; ++ } ++ ++ pipe_buf.buf[0].flags = FUSE_BUF_IS_FD; ++ pipe_buf.buf[0].fd = llp->pipe[1]; ++ ++ res = fuse_buf_copy(&pipe_buf, buf, ++ FUSE_BUF_FORCE_SPLICE | FUSE_BUF_SPLICE_NONBLOCK); ++ if (res < 0) { ++ if (res == -EAGAIN || res == -EINVAL) { ++ /* ++ * Should only get EAGAIN on kernels with ++ * broken SPLICE_F_NONBLOCK support (<= ++ * 2.6.35) where this error or a short read is ++ * returned even if the pipe itself is not ++ * full ++ * ++ * EINVAL might mean that splice can't handle ++ * this combination of input and output. ++ */ ++ if (res == -EAGAIN) ++ se->broken_splice_nonblock = 1; ++ ++ pthread_setspecific(se->pipe_key, NULL); ++ fuse_ll_pipe_free(llp); ++ goto fallback; ++ } ++ res = -res; ++ goto clear_pipe; ++ } ++ ++ if (res != 0 && res < len) { ++ struct fuse_bufvec mem_buf = FUSE_BUFVEC_INIT(len); ++ void *mbuf; ++ size_t now_len = res; ++ /* ++ * For regular files a short count is either ++ * 1) due to EOF, or ++ * 2) because of broken SPLICE_F_NONBLOCK (see above) ++ * ++ * For other inputs it's possible that we overflowed ++ * the pipe because of small buffer fragments. ++ */ ++ ++ res = posix_memalign(&mbuf, pagesize, len); ++ if (res != 0) ++ goto clear_pipe; ++ ++ mem_buf.buf[0].mem = mbuf; ++ mem_buf.off = now_len; ++ res = fuse_buf_copy(&mem_buf, buf, 0); ++ if (res > 0) { ++ char *tmpbuf; ++ size_t extra_len = res; ++ /* ++ * Trickiest case: got more data. Need to get ++ * back the data from the pipe and then fall ++ * back to regular write. 
++ */ ++ tmpbuf = malloc(headerlen); ++ if (tmpbuf == NULL) { ++ free(mbuf); ++ res = ENOMEM; ++ goto clear_pipe; ++ } ++ res = read_back(llp->pipe[0], tmpbuf, headerlen); ++ free(tmpbuf); ++ if (res != 0) { ++ free(mbuf); ++ goto clear_pipe; ++ } ++ res = read_back(llp->pipe[0], mbuf, now_len); ++ if (res != 0) { ++ free(mbuf); ++ goto clear_pipe; ++ } ++ len = now_len + extra_len; ++ iov[iov_count].iov_base = mbuf; ++ iov[iov_count].iov_len = len; ++ iov_count++; ++ res = fuse_send_msg(se, ch, iov, iov_count); ++ free(mbuf); ++ return res; ++ } ++ free(mbuf); ++ res = now_len; ++ } ++ len = res; ++ out->len = headerlen + len; ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, success, outsize: %i (splice)\n", ++ (unsigned long long) out->unique, out->len); ++ } ++ ++ splice_flags = 0; ++ if ((flags & FUSE_BUF_SPLICE_MOVE) && ++ (se->conn.want & FUSE_CAP_SPLICE_MOVE)) ++ splice_flags |= SPLICE_F_MOVE; ++ ++ res = splice(llp->pipe[0], NULL, ch ? ch->fd : se->fd, ++ NULL, out->len, splice_flags); ++ if (res == -1) { ++ res = -errno; ++ perror("fuse: splice from pipe"); ++ goto clear_pipe; ++ } ++ if (res != out->len) { ++ res = -EIO; ++ fuse_log(FUSE_LOG_ERR, "fuse: short splice from pipe: %u/%u\n", ++ res, out->len); ++ goto clear_pipe; ++ } ++ return 0; ++ ++clear_pipe: ++ fuse_ll_clear_pipe(se); ++ return res; ++ ++fallback: ++ return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); ++} ++#else ++static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int iov_count, ++ struct fuse_bufvec *buf, unsigned int flags) ++{ ++ size_t len = fuse_buf_size(buf); ++ (void) flags; ++ ++ return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); ++} ++#endif ++ ++int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags) ++{ ++ struct iovec iov[2]; ++ struct fuse_out_header out; ++ int res; ++ ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct 
fuse_out_header); ++ ++ out.unique = req->unique; ++ out.error = 0; ++ ++ res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv, flags); ++ if (res <= 0) { ++ fuse_free_req(req); ++ return res; ++ } else { ++ return fuse_reply_err(req, res); ++ } ++} ++ ++int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf) ++{ ++ struct fuse_statfs_out arg; ++ size_t size = req->se->conn.proto_minor < 4 ? ++ FUSE_COMPAT_STATFS_SIZE : sizeof(arg); ++ ++ memset(&arg, 0, sizeof(arg)); ++ convert_statfs(stbuf, &arg.st); ++ ++ return send_reply_ok(req, &arg, size); ++} ++ ++int fuse_reply_xattr(fuse_req_t req, size_t count) ++{ ++ struct fuse_getxattr_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.size = count; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_lock(fuse_req_t req, const struct flock *lock) ++{ ++ struct fuse_lk_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.lk.type = lock->l_type; ++ if (lock->l_type != F_UNLCK) { ++ arg.lk.start = lock->l_start; ++ if (lock->l_len == 0) ++ arg.lk.end = OFFSET_MAX; ++ else ++ arg.lk.end = lock->l_start + lock->l_len - 1; ++ } ++ arg.lk.pid = lock->l_pid; ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_bmap(fuse_req_t req, uint64_t idx) ++{ ++ struct fuse_bmap_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.block = idx; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++static struct fuse_ioctl_iovec *fuse_ioctl_iovec_copy(const struct iovec *iov, ++ size_t count) ++{ ++ struct fuse_ioctl_iovec *fiov; ++ size_t i; ++ ++ fiov = malloc(sizeof(fiov[0]) * count); ++ if (!fiov) ++ return NULL; ++ ++ for (i = 0; i < count; i++) { ++ fiov[i].base = (uintptr_t) iov[i].iov_base; ++ fiov[i].len = iov[i].iov_len; ++ } ++ ++ return fiov; ++} ++ ++int fuse_reply_ioctl_retry(fuse_req_t req, ++ const struct iovec *in_iov, size_t in_count, ++ const struct iovec *out_iov, size_t out_count) ++{ ++ struct fuse_ioctl_out arg; ++ struct fuse_ioctl_iovec *in_fiov = 
NULL; ++ struct fuse_ioctl_iovec *out_fiov = NULL; ++ struct iovec iov[4]; ++ size_t count = 1; ++ int res; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.flags |= FUSE_IOCTL_RETRY; ++ arg.in_iovs = in_count; ++ arg.out_iovs = out_count; ++ iov[count].iov_base = &arg; ++ iov[count].iov_len = sizeof(arg); ++ count++; ++ ++ if (req->se->conn.proto_minor < 16) { ++ if (in_count) { ++ iov[count].iov_base = (void *)in_iov; ++ iov[count].iov_len = sizeof(in_iov[0]) * in_count; ++ count++; ++ } ++ ++ if (out_count) { ++ iov[count].iov_base = (void *)out_iov; ++ iov[count].iov_len = sizeof(out_iov[0]) * out_count; ++ count++; ++ } ++ } else { ++ /* Can't handle non-compat 64bit ioctls on 32bit */ ++ if (sizeof(void *) == 4 && req->ioctl_64bit) { ++ res = fuse_reply_err(req, EINVAL); ++ goto out; ++ } ++ ++ if (in_count) { ++ in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); ++ if (!in_fiov) ++ goto enomem; ++ ++ iov[count].iov_base = (void *)in_fiov; ++ iov[count].iov_len = sizeof(in_fiov[0]) * in_count; ++ count++; ++ } ++ if (out_count) { ++ out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); ++ if (!out_fiov) ++ goto enomem; ++ ++ iov[count].iov_base = (void *)out_fiov; ++ iov[count].iov_len = sizeof(out_fiov[0]) * out_count; ++ count++; ++ } ++ } ++ ++ res = send_reply_iov(req, 0, iov, count); ++out: ++ free(in_fiov); ++ free(out_fiov); ++ ++ return res; ++ ++enomem: ++ res = fuse_reply_err(req, ENOMEM); ++ goto out; ++} ++ ++int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size) ++{ ++ struct fuse_ioctl_out arg; ++ struct iovec iov[3]; ++ size_t count = 1; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.result = result; ++ iov[count].iov_base = &arg; ++ iov[count].iov_len = sizeof(arg); ++ count++; ++ ++ if (size) { ++ iov[count].iov_base = (char *) buf; ++ iov[count].iov_len = size; ++ count++; ++ } ++ ++ return send_reply_iov(req, 0, iov, count); ++} ++ ++int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, ++ int count) 
++{ ++ struct iovec *padded_iov; ++ struct fuse_ioctl_out arg; ++ int res; ++ ++ padded_iov = malloc((count + 2) * sizeof(struct iovec)); ++ if (padded_iov == NULL) ++ return fuse_reply_err(req, ENOMEM); ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.result = result; ++ padded_iov[1].iov_base = &arg; ++ padded_iov[1].iov_len = sizeof(arg); ++ ++ memcpy(&padded_iov[2], iov, count * sizeof(struct iovec)); ++ ++ res = send_reply_iov(req, 0, padded_iov, count + 2); ++ free(padded_iov); ++ ++ return res; ++} ++ ++int fuse_reply_poll(fuse_req_t req, unsigned revents) ++{ ++ struct fuse_poll_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.revents = revents; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_lseek(fuse_req_t req, off_t off) ++{ ++ struct fuse_lseek_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.offset = off; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ ++ if (req->se->op.lookup) ++ req->se->op.lookup(req, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_forget(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_forget_in *arg = (struct fuse_forget_in *) inarg; ++ ++ if (req->se->op.forget) ++ req->se->op.forget(req, nodeid, arg->nlookup); ++ else ++ fuse_reply_none(req); ++} ++ ++static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, ++ const void *inarg) ++{ ++ struct fuse_batch_forget_in *arg = (void *) inarg; ++ struct fuse_forget_one *param = (void *) PARAM(arg); ++ unsigned int i; ++ ++ (void) nodeid; ++ ++ if (req->se->op.forget_multi) { ++ req->se->op.forget_multi(req, arg->count, ++ (struct fuse_forget_data *) param); ++ } else if (req->se->op.forget) { ++ for (i = 0; i < arg->count; i++) { ++ struct fuse_forget_one *forget = &param[i]; ++ struct fuse_req *dummy_req; ++ ++ dummy_req = fuse_ll_alloc_req(req->se); ++ if (dummy_req == 
NULL) ++ break; ++ ++ dummy_req->unique = req->unique; ++ dummy_req->ctx = req->ctx; ++ dummy_req->ch = NULL; ++ ++ req->se->op.forget(dummy_req, forget->nodeid, ++ forget->nlookup); ++ } ++ fuse_reply_none(req); ++ } else { ++ fuse_reply_none(req); ++ } ++} ++ ++static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_file_info *fip = NULL; ++ struct fuse_file_info fi; ++ ++ if (req->se->conn.proto_minor >= 9) { ++ struct fuse_getattr_in *arg = (struct fuse_getattr_in *) inarg; ++ ++ if (arg->getattr_flags & FUSE_GETATTR_FH) { ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fip = &fi; ++ } ++ } ++ ++ if (req->se->op.getattr) ++ req->se->op.getattr(req, nodeid, fip); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_setattr_in *arg = (struct fuse_setattr_in *) inarg; ++ ++ if (req->se->op.setattr) { ++ struct fuse_file_info *fi = NULL; ++ struct fuse_file_info fi_store; ++ struct stat stbuf; ++ memset(&stbuf, 0, sizeof(stbuf)); ++ convert_attr(arg, &stbuf); ++ if (arg->valid & FATTR_FH) { ++ arg->valid &= ~FATTR_FH; ++ memset(&fi_store, 0, sizeof(fi_store)); ++ fi = &fi_store; ++ fi->fh = arg->fh; ++ } ++ arg->valid &= ++ FUSE_SET_ATTR_MODE | ++ FUSE_SET_ATTR_UID | ++ FUSE_SET_ATTR_GID | ++ FUSE_SET_ATTR_SIZE | ++ FUSE_SET_ATTR_ATIME | ++ FUSE_SET_ATTR_MTIME | ++ FUSE_SET_ATTR_ATIME_NOW | ++ FUSE_SET_ATTR_MTIME_NOW | ++ FUSE_SET_ATTR_CTIME; ++ ++ req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); ++ } else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_access(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_access_in *arg = (struct fuse_access_in *) inarg; ++ ++ if (req->se->op.access) ++ req->se->op.access(req, nodeid, arg->mask); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ (void) inarg; ++ ++ if 
(req->se->op.readlink) ++ req->se->op.readlink(req, nodeid); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_mknod_in *arg = (struct fuse_mknod_in *) inarg; ++ char *name = PARAM(arg); ++ ++ if (req->se->conn.proto_minor >= 12) ++ req->ctx.umask = arg->umask; ++ else ++ name = (char *) inarg + FUSE_COMPAT_MKNOD_IN_SIZE; ++ ++ if (req->se->op.mknod) ++ req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *) inarg; ++ ++ if (req->se->conn.proto_minor >= 12) ++ req->ctx.umask = arg->umask; ++ ++ if (req->se->op.mkdir) ++ req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ ++ if (req->se->op.unlink) ++ req->se->op.unlink(req, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ ++ if (req->se->op.rmdir) ++ req->se->op.rmdir(req, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ char *linkname = ((char *) inarg) + strlen((char *) inarg) + 1; ++ ++ if (req->se->op.symlink) ++ req->se->op.symlink(req, linkname, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_rename(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_rename_in *arg = (struct fuse_rename_in *) inarg; ++ char *oldname = PARAM(arg); ++ char *newname = oldname + strlen(oldname) + 1; ++ ++ if (req->se->op.rename) ++ req->se->op.rename(req, nodeid, oldname, arg->newdir, 
newname, ++ 0); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_rename2_in *arg = (struct fuse_rename2_in *) inarg; ++ char *oldname = PARAM(arg); ++ char *newname = oldname + strlen(oldname) + 1; ++ ++ if (req->se->op.rename) ++ req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, ++ arg->flags); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_link(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_link_in *arg = (struct fuse_link_in *) inarg; ++ ++ if (req->se->op.link) ++ req->se->op.link(req, arg->oldnodeid, nodeid, PARAM(arg)); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_create_in *arg = (struct fuse_create_in *) inarg; ++ ++ if (req->se->op.create) { ++ struct fuse_file_info fi; ++ char *name = PARAM(arg); ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ ++ if (req->se->conn.proto_minor >= 12) ++ req->ctx.umask = arg->umask; ++ else ++ name = (char *) inarg + sizeof(struct fuse_open_in); ++ ++ req->se->op.create(req, nodeid, name, arg->mode, &fi); ++ } else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_open(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_open_in *arg = (struct fuse_open_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ ++ if (req->se->op.open) ++ req->se->op.open(req, nodeid, &fi); ++ else ++ fuse_reply_open(req, &fi); ++} ++ ++static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_read_in *arg = (struct fuse_read_in *) inarg; ++ ++ if (req->se->op.read) { ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ if (req->se->conn.proto_minor >= 9) { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ } ++ 
req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); ++ } else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_write_in *arg = (struct fuse_write_in *) inarg; ++ struct fuse_file_info fi; ++ char *param; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; ++ ++ if (req->se->conn.proto_minor < 9) { ++ param = ((char *) arg) + FUSE_COMPAT_WRITE_IN_SIZE; ++ } else { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ param = PARAM(arg); ++ } ++ ++ if (req->se->op.write) ++ req->se->op.write(req, nodeid, param, arg->size, ++ arg->offset, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, ++ const struct fuse_buf *ibuf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_bufvec bufv = { ++ .buf[0] = *ibuf, ++ .count = 1, ++ }; ++ struct fuse_write_in *arg = (struct fuse_write_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; ++ ++ if (se->conn.proto_minor < 9) { ++ bufv.buf[0].mem = ((char *) arg) + FUSE_COMPAT_WRITE_IN_SIZE; ++ bufv.buf[0].size -= sizeof(struct fuse_in_header) + ++ FUSE_COMPAT_WRITE_IN_SIZE; ++ assert(!(bufv.buf[0].flags & FUSE_BUF_IS_FD)); ++ } else { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) ++ bufv.buf[0].mem = PARAM(arg); ++ ++ bufv.buf[0].size -= sizeof(struct fuse_in_header) + ++ sizeof(struct fuse_write_in); ++ } ++ if (bufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); ++ fuse_reply_err(req, EIO); ++ goto out; ++ } ++ bufv.buf[0].size = arg->size; ++ ++ se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); ++ ++out: ++ /* Need to reset the pipe if ->write_buf() didn't 
consume all data */ ++ if ((ibuf->flags & FUSE_BUF_IS_FD) && bufv.idx < bufv.count) ++ fuse_ll_clear_pipe(se); ++} ++ ++static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_flush_in *arg = (struct fuse_flush_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.flush = 1; ++ if (req->se->conn.proto_minor >= 7) ++ fi.lock_owner = arg->lock_owner; ++ ++ if (req->se->op.flush) ++ req->se->op.flush(req, nodeid, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_release_in *arg = (struct fuse_release_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ fi.fh = arg->fh; ++ if (req->se->conn.proto_minor >= 8) { ++ fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; ++ fi.lock_owner = arg->lock_owner; ++ } ++ if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { ++ fi.flock_release = 1; ++ fi.lock_owner = arg->lock_owner; ++ } ++ ++ if (req->se->op.release) ++ req->se->op.release(req, nodeid, &fi); ++ else ++ fuse_reply_err(req, 0); ++} ++ ++static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; ++ struct fuse_file_info fi; ++ int datasync = arg->fsync_flags & 1; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.fsync) ++ req->se->op.fsync(req, nodeid, datasync, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_open_in *arg = (struct fuse_open_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ ++ if (req->se->op.opendir) ++ req->se->op.opendir(req, nodeid, &fi); ++ else ++ fuse_reply_open(req, &fi); ++} ++ ++static void do_readdir(fuse_req_t req, fuse_ino_t 
nodeid, const void *inarg) ++{ ++ struct fuse_read_in *arg = (struct fuse_read_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.readdir) ++ req->se->op.readdir(req, nodeid, arg->size, arg->offset, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_read_in *arg = (struct fuse_read_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.readdirplus) ++ req->se->op.readdirplus(req, nodeid, arg->size, arg->offset, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_release_in *arg = (struct fuse_release_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ fi.fh = arg->fh; ++ ++ if (req->se->op.releasedir) ++ req->se->op.releasedir(req, nodeid, &fi); ++ else ++ fuse_reply_err(req, 0); ++} ++ ++static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; ++ struct fuse_file_info fi; ++ int datasync = arg->fsync_flags & 1; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.fsyncdir) ++ req->se->op.fsyncdir(req, nodeid, datasync, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ (void) nodeid; ++ (void) inarg; ++ ++ if (req->se->op.statfs) ++ req->se->op.statfs(req, nodeid); ++ else { ++ struct statvfs buf = { ++ .f_namemax = 255, ++ .f_bsize = 512, ++ }; ++ fuse_reply_statfs(req, &buf); ++ } ++} ++ ++static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *) inarg; ++ char *name = PARAM(arg); ++ char *value = name 
+ strlen(name) + 1; ++ ++ if (req->se->op.setxattr) ++ req->se->op.setxattr(req, nodeid, name, value, arg->size, ++ arg->flags); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg; ++ ++ if (req->se->op.getxattr) ++ req->se->op.getxattr(req, nodeid, PARAM(arg), arg->size); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg; ++ ++ if (req->se->op.listxattr) ++ req->se->op.listxattr(req, nodeid, arg->size); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ ++ if (req->se->op.removexattr) ++ req->se->op.removexattr(req, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void convert_fuse_file_lock(struct fuse_file_lock *fl, ++ struct flock *flock) ++{ ++ memset(flock, 0, sizeof(struct flock)); ++ flock->l_type = fl->type; ++ flock->l_whence = SEEK_SET; ++ flock->l_start = fl->start; ++ if (fl->end == OFFSET_MAX) ++ flock->l_len = 0; ++ else ++ flock->l_len = fl->end - fl->start + 1; ++ flock->l_pid = fl->pid; ++} ++ ++static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_lk_in *arg = (struct fuse_lk_in *) inarg; ++ struct fuse_file_info fi; ++ struct flock flock; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.lock_owner = arg->owner; ++ ++ convert_fuse_file_lock(&arg->lk, &flock); ++ if (req->se->op.getlk) ++ req->se->op.getlk(req, nodeid, &fi, &flock); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, ++ const void *inarg, int sleep) ++{ ++ struct fuse_lk_in *arg = (struct fuse_lk_in *) inarg; ++ struct fuse_file_info fi; ++ 
struct flock flock; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.lock_owner = arg->owner; ++ ++ if (arg->lk_flags & FUSE_LK_FLOCK) { ++ int op = 0; ++ ++ switch (arg->lk.type) { ++ case F_RDLCK: ++ op = LOCK_SH; ++ break; ++ case F_WRLCK: ++ op = LOCK_EX; ++ break; ++ case F_UNLCK: ++ op = LOCK_UN; ++ break; ++ } ++ if (!sleep) ++ op |= LOCK_NB; ++ ++ if (req->se->op.flock) ++ req->se->op.flock(req, nodeid, &fi, op); ++ else ++ fuse_reply_err(req, ENOSYS); ++ } else { ++ convert_fuse_file_lock(&arg->lk, &flock); ++ if (req->se->op.setlk) ++ req->se->op.setlk(req, nodeid, &fi, &flock, sleep); ++ else ++ fuse_reply_err(req, ENOSYS); ++ } ++} ++ ++static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ do_setlk_common(req, nodeid, inarg, 0); ++} ++ ++static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ do_setlk_common(req, nodeid, inarg, 1); ++} ++ ++static int find_interrupted(struct fuse_session *se, struct fuse_req *req) ++{ ++ struct fuse_req *curr; ++ ++ for (curr = se->list.next; curr != &se->list; curr = curr->next) { ++ if (curr->unique == req->u.i.unique) { ++ fuse_interrupt_func_t func; ++ void *data; ++ ++ curr->ctr++; ++ pthread_mutex_unlock(&se->lock); ++ ++ /* Ugh, ugly locking */ ++ pthread_mutex_lock(&curr->lock); ++ pthread_mutex_lock(&se->lock); ++ curr->interrupted = 1; ++ func = curr->u.ni.func; ++ data = curr->u.ni.data; ++ pthread_mutex_unlock(&se->lock); ++ if (func) ++ func(curr, data); ++ pthread_mutex_unlock(&curr->lock); ++ ++ pthread_mutex_lock(&se->lock); ++ curr->ctr--; ++ if (!curr->ctr) ++ destroy_req(curr); ++ ++ return 1; ++ } ++ } ++ for (curr = se->interrupts.next; curr != &se->interrupts; ++ curr = curr->next) { ++ if (curr->u.i.unique == req->u.i.unique) ++ return 1; ++ } ++ return 0; ++} ++ ++static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_interrupt_in *arg = (struct fuse_interrupt_in *) inarg; ++ struct 
fuse_session *se = req->se; ++ ++ (void) nodeid; ++ if (se->debug) ++ fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", ++ (unsigned long long) arg->unique); ++ ++ req->u.i.unique = arg->unique; ++ ++ pthread_mutex_lock(&se->lock); ++ if (find_interrupted(se, req)) ++ destroy_req(req); ++ else ++ list_add_req(req, &se->interrupts); ++ pthread_mutex_unlock(&se->lock); ++} ++ ++static struct fuse_req *check_interrupt(struct fuse_session *se, ++ struct fuse_req *req) ++{ ++ struct fuse_req *curr; ++ ++ for (curr = se->interrupts.next; curr != &se->interrupts; ++ curr = curr->next) { ++ if (curr->u.i.unique == req->unique) { ++ req->interrupted = 1; ++ list_del_req(curr); ++ free(curr); ++ return NULL; ++ } ++ } ++ curr = se->interrupts.next; ++ if (curr != &se->interrupts) { ++ list_del_req(curr); ++ list_init_req(curr); ++ return curr; ++ } else ++ return NULL; ++} ++ ++static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_bmap_in *arg = (struct fuse_bmap_in *) inarg; ++ ++ if (req->se->op.bmap) ++ req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_ioctl_in *arg = (struct fuse_ioctl_in *) inarg; ++ unsigned int flags = arg->flags; ++ void *in_buf = arg->in_size ? 
PARAM(arg) : NULL; ++ struct fuse_file_info fi; ++ ++ if (flags & FUSE_IOCTL_DIR && ++ !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { ++ fuse_reply_err(req, ENOTTY); ++ return; ++ } ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (sizeof(void *) == 4 && req->se->conn.proto_minor >= 16 && ++ !(flags & FUSE_IOCTL_32BIT)) { ++ req->ioctl_64bit = 1; ++ } ++ ++ if (req->se->op.ioctl) ++ req->se->op.ioctl(req, nodeid, arg->cmd, ++ (void *)(uintptr_t)arg->arg, &fi, flags, ++ in_buf, arg->in_size, arg->out_size); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++void fuse_pollhandle_destroy(struct fuse_pollhandle *ph) ++{ ++ free(ph); ++} ++ ++static void do_poll(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_poll_in *arg = (struct fuse_poll_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.poll_events = arg->events; ++ ++ if (req->se->op.poll) { ++ struct fuse_pollhandle *ph = NULL; ++ ++ if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) { ++ ph = malloc(sizeof(struct fuse_pollhandle)); ++ if (ph == NULL) { ++ fuse_reply_err(req, ENOMEM); ++ return; ++ } ++ ph->kh = arg->kh; ++ ph->se = req->se; ++ } ++ ++ req->se->op.poll(req, nodeid, &fi, ph); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } ++} ++ ++static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_fallocate_in *arg = (struct fuse_fallocate_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.fallocate) ++ req->se->op.fallocate(req, nodeid, arg->mode, arg->offset, arg->length, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, const void *inarg) ++{ ++ struct fuse_copy_file_range_in *arg = (struct fuse_copy_file_range_in *) inarg; ++ struct fuse_file_info fi_in, fi_out; ++ ++ memset(&fi_in, 0, sizeof(fi_in)); ++ fi_in.fh = arg->fh_in; ++ ++ 
memset(&fi_out, 0, sizeof(fi_out)); ++ fi_out.fh = arg->fh_out; ++ ++ ++ if (req->se->op.copy_file_range) ++ req->se->op.copy_file_range(req, nodeid_in, arg->off_in, ++ &fi_in, arg->nodeid_out, ++ arg->off_out, &fi_out, arg->len, ++ arg->flags); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_lseek_in *arg = (struct fuse_lseek_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.lseek) ++ req->se->op.lseek(req, nodeid, arg->offset, arg->whence, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_init_in *arg = (struct fuse_init_in *) inarg; ++ struct fuse_init_out outarg; ++ struct fuse_session *se = req->se; ++ size_t bufsize = se->bufsize; ++ size_t outargsize = sizeof(outarg); ++ ++ (void) nodeid; ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); ++ if (arg->major == 7 && arg->minor >= 6) { ++ fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); ++ fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", ++ arg->max_readahead); ++ } ++ } ++ se->conn.proto_major = arg->major; ++ se->conn.proto_minor = arg->minor; ++ se->conn.capable = 0; ++ se->conn.want = 0; ++ ++ memset(&outarg, 0, sizeof(outarg)); ++ outarg.major = FUSE_KERNEL_VERSION; ++ outarg.minor = FUSE_KERNEL_MINOR_VERSION; ++ ++ if (arg->major < 7) { ++ fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", ++ arg->major, arg->minor); ++ fuse_reply_err(req, EPROTO); ++ return; ++ } ++ ++ if (arg->major > 7) { ++ /* Wait for a second INIT request with a 7.X version */ ++ send_reply_ok(req, &outarg, sizeof(outarg)); ++ return; ++ } ++ ++ if (arg->minor >= 6) { ++ if (arg->max_readahead < se->conn.max_readahead) ++ se->conn.max_readahead = arg->max_readahead; ++ if (arg->flags & FUSE_ASYNC_READ) ++ 
se->conn.capable |= FUSE_CAP_ASYNC_READ; ++ if (arg->flags & FUSE_POSIX_LOCKS) ++ se->conn.capable |= FUSE_CAP_POSIX_LOCKS; ++ if (arg->flags & FUSE_ATOMIC_O_TRUNC) ++ se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; ++ if (arg->flags & FUSE_EXPORT_SUPPORT) ++ se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; ++ if (arg->flags & FUSE_DONT_MASK) ++ se->conn.capable |= FUSE_CAP_DONT_MASK; ++ if (arg->flags & FUSE_FLOCK_LOCKS) ++ se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; ++ if (arg->flags & FUSE_AUTO_INVAL_DATA) ++ se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; ++ if (arg->flags & FUSE_DO_READDIRPLUS) ++ se->conn.capable |= FUSE_CAP_READDIRPLUS; ++ if (arg->flags & FUSE_READDIRPLUS_AUTO) ++ se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; ++ if (arg->flags & FUSE_ASYNC_DIO) ++ se->conn.capable |= FUSE_CAP_ASYNC_DIO; ++ if (arg->flags & FUSE_WRITEBACK_CACHE) ++ se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; ++ if (arg->flags & FUSE_NO_OPEN_SUPPORT) ++ se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; ++ if (arg->flags & FUSE_PARALLEL_DIROPS) ++ se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; ++ if (arg->flags & FUSE_POSIX_ACL) ++ se->conn.capable |= FUSE_CAP_POSIX_ACL; ++ if (arg->flags & FUSE_HANDLE_KILLPRIV) ++ se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; ++ if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) ++ se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; ++ if (!(arg->flags & FUSE_MAX_PAGES)) { ++ size_t max_bufsize = ++ FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() ++ + FUSE_BUFFER_HEADER_SIZE; ++ if (bufsize > max_bufsize) { ++ bufsize = max_bufsize; ++ } ++ } ++ } else { ++ se->conn.max_readahead = 0; ++ } ++ ++ if (se->conn.proto_minor >= 14) { ++#ifdef HAVE_SPLICE ++#ifdef HAVE_VMSPLICE ++ se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; ++#endif ++ se->conn.capable |= FUSE_CAP_SPLICE_READ; ++#endif ++ } ++ if (se->conn.proto_minor >= 18) ++ se->conn.capable |= FUSE_CAP_IOCTL_DIR; ++ ++ /* Default settings for modern filesystems. 
++ * ++ * Most of these capabilities were disabled by default in ++ * libfuse2 for backwards compatibility reasons. In libfuse3, ++ * we can finally enable them by default (as long as they're ++ * supported by the kernel). ++ */ ++#define LL_SET_DEFAULT(cond, cap) \ ++ if ((cond) && (se->conn.capable & (cap))) \ ++ se->conn.want |= (cap) ++ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ); ++ LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS); ++ LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA); ++ LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV); ++ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO); ++ LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR); ++ LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC); ++ LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ); ++ LL_SET_DEFAULT(se->op.getlk && se->op.setlk, ++ FUSE_CAP_POSIX_LOCKS); ++ LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS); ++ LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS); ++ LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir, ++ FUSE_CAP_READDIRPLUS_AUTO); ++ se->conn.time_gran = 1; ++ ++ if (bufsize < FUSE_MIN_READ_BUFFER) { ++ fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n", ++ bufsize); ++ bufsize = FUSE_MIN_READ_BUFFER; ++ } ++ se->bufsize = bufsize; ++ ++ if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) ++ se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE; ++ ++ se->got_init = 1; ++ if (se->op.init) ++ se->op.init(se->userdata, &se->conn); ++ ++ if (se->conn.want & (~se->conn.capable)) { ++ fuse_log(FUSE_LOG_ERR, "fuse: error: filesystem requested capabilities " ++ "0x%x that are not supported by kernel, aborting.\n", ++ se->conn.want & (~se->conn.capable)); ++ fuse_reply_err(req, EPROTO); ++ se->error = -EPROTO; ++ fuse_session_exit(se); ++ return; ++ } ++ ++ unsigned max_read_mo = get_max_read(se->mo); ++ if (se->conn.max_read != max_read_mo) { ++ fuse_log(FUSE_LOG_ERR, "fuse: error: init() and fuse_session_new() " ++ "requested different maximum read size (%u vs %u)\n", ++ se->conn.max_read, 
max_read_mo); ++ fuse_reply_err(req, EPROTO); ++ se->error = -EPROTO; ++ fuse_session_exit(se); ++ return; ++ } ++ ++ if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { ++ se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; ++ } ++ if (arg->flags & FUSE_MAX_PAGES) { ++ outarg.flags |= FUSE_MAX_PAGES; ++ outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1; ++ } ++ ++ /* Always enable big writes, this is superseded ++ by the max_write option */ ++ outarg.flags |= FUSE_BIG_WRITES; ++ ++ if (se->conn.want & FUSE_CAP_ASYNC_READ) ++ outarg.flags |= FUSE_ASYNC_READ; ++ if (se->conn.want & FUSE_CAP_POSIX_LOCKS) ++ outarg.flags |= FUSE_POSIX_LOCKS; ++ if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) ++ outarg.flags |= FUSE_ATOMIC_O_TRUNC; ++ if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) ++ outarg.flags |= FUSE_EXPORT_SUPPORT; ++ if (se->conn.want & FUSE_CAP_DONT_MASK) ++ outarg.flags |= FUSE_DONT_MASK; ++ if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) ++ outarg.flags |= FUSE_FLOCK_LOCKS; ++ if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) ++ outarg.flags |= FUSE_AUTO_INVAL_DATA; ++ if (se->conn.want & FUSE_CAP_READDIRPLUS) ++ outarg.flags |= FUSE_DO_READDIRPLUS; ++ if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) ++ outarg.flags |= FUSE_READDIRPLUS_AUTO; ++ if (se->conn.want & FUSE_CAP_ASYNC_DIO) ++ outarg.flags |= FUSE_ASYNC_DIO; ++ if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) ++ outarg.flags |= FUSE_WRITEBACK_CACHE; ++ if (se->conn.want & FUSE_CAP_POSIX_ACL) ++ outarg.flags |= FUSE_POSIX_ACL; ++ outarg.max_readahead = se->conn.max_readahead; ++ outarg.max_write = se->conn.max_write; ++ if (se->conn.proto_minor >= 13) { ++ if (se->conn.max_background >= (1 << 16)) ++ se->conn.max_background = (1 << 16) - 1; ++ if (se->conn.congestion_threshold > se->conn.max_background) ++ se->conn.congestion_threshold = se->conn.max_background; ++ if (!se->conn.congestion_threshold) { ++ se->conn.congestion_threshold = ++ se->conn.max_background * 3 / 4; ++ } ++ ++ 
outarg.max_background = se->conn.max_background; ++ outarg.congestion_threshold = se->conn.congestion_threshold; ++ } ++ if (se->conn.proto_minor >= 23) ++ outarg.time_gran = se->conn.time_gran; ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); ++ fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); ++ fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", ++ outarg.max_readahead); ++ fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); ++ fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", ++ outarg.max_background); ++ fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", ++ outarg.congestion_threshold); ++ fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", ++ outarg.time_gran); ++ } ++ if (arg->minor < 5) ++ outargsize = FUSE_COMPAT_INIT_OUT_SIZE; ++ else if (arg->minor < 23) ++ outargsize = FUSE_COMPAT_22_INIT_OUT_SIZE; ++ ++ send_reply_ok(req, &outarg, outargsize); ++} ++ ++static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_session *se = req->se; ++ ++ (void) nodeid; ++ (void) inarg; ++ ++ se->got_destroy = 1; ++ if (se->op.destroy) ++ se->op.destroy(se->userdata); ++ ++ send_reply_ok(req, NULL, 0); ++} ++ ++static void list_del_nreq(struct fuse_notify_req *nreq) ++{ ++ struct fuse_notify_req *prev = nreq->prev; ++ struct fuse_notify_req *next = nreq->next; ++ prev->next = next; ++ next->prev = prev; ++} ++ ++static void list_add_nreq(struct fuse_notify_req *nreq, ++ struct fuse_notify_req *next) ++{ ++ struct fuse_notify_req *prev = next->prev; ++ nreq->next = next; ++ nreq->prev = prev; ++ prev->next = nreq; ++ next->prev = nreq; ++} ++ ++static void list_init_nreq(struct fuse_notify_req *nreq) ++{ ++ nreq->next = nreq; ++ nreq->prev = nreq; ++} ++ ++static void do_notify_reply(fuse_req_t req, fuse_ino_t nodeid, ++ const void *inarg, const struct fuse_buf *buf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_notify_req *nreq; ++ struct fuse_notify_req 
*head; ++ ++ pthread_mutex_lock(&se->lock); ++ head = &se->notify_list; ++ for (nreq = head->next; nreq != head; nreq = nreq->next) { ++ if (nreq->unique == req->unique) { ++ list_del_nreq(nreq); ++ break; ++ } ++ } ++ pthread_mutex_unlock(&se->lock); ++ ++ if (nreq != head) ++ nreq->reply(nreq, req, nodeid, inarg, buf); ++} ++ ++static int send_notify_iov(struct fuse_session *se, int notify_code, ++ struct iovec *iov, int count) ++{ ++ struct fuse_out_header out; ++ ++ if (!se->got_init) ++ return -ENOTCONN; ++ ++ out.unique = 0; ++ out.error = notify_code; ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); ++ ++ return fuse_send_msg(se, NULL, iov, count); ++} ++ ++int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph) ++{ ++ if (ph != NULL) { ++ struct fuse_notify_poll_wakeup_out outarg; ++ struct iovec iov[2]; ++ ++ outarg.kh = ph->kh; ++ ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ ++ return send_notify_iov(ph->se, FUSE_NOTIFY_POLL, iov, 2); ++ } else { ++ return 0; ++ } ++} ++ ++int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, ++ off_t off, off_t len) ++{ ++ struct fuse_notify_inval_inode_out outarg; ++ struct iovec iov[2]; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) ++ return -ENOSYS; ++ ++ outarg.ino = ino; ++ outarg.off = off; ++ outarg.len = len; ++ ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ ++ return send_notify_iov(se, FUSE_NOTIFY_INVAL_INODE, iov, 2); ++} ++ ++int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, ++ const char *name, size_t namelen) ++{ ++ struct fuse_notify_inval_entry_out outarg; ++ struct iovec iov[3]; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) ++ return -ENOSYS; ++ ++ outarg.parent = parent; ++ outarg.namelen = namelen; ++ outarg.padding = 0; ++ ++ iov[1].iov_base = &outarg; ++ 
iov[1].iov_len = sizeof(outarg); ++ iov[2].iov_base = (void *)name; ++ iov[2].iov_len = namelen + 1; ++ ++ return send_notify_iov(se, FUSE_NOTIFY_INVAL_ENTRY, iov, 3); ++} ++ ++int fuse_lowlevel_notify_delete(struct fuse_session *se, ++ fuse_ino_t parent, fuse_ino_t child, ++ const char *name, size_t namelen) ++{ ++ struct fuse_notify_delete_out outarg; ++ struct iovec iov[3]; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 18) ++ return -ENOSYS; ++ ++ outarg.parent = parent; ++ outarg.child = child; ++ outarg.namelen = namelen; ++ outarg.padding = 0; ++ ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ iov[2].iov_base = (void *)name; ++ iov[2].iov_len = namelen + 1; ++ ++ return send_notify_iov(se, FUSE_NOTIFY_DELETE, iov, 3); ++} ++ ++int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, ++ off_t offset, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags) ++{ ++ struct fuse_out_header out; ++ struct fuse_notify_store_out outarg; ++ struct iovec iov[3]; ++ size_t size = fuse_buf_size(bufv); ++ int res; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) ++ return -ENOSYS; ++ ++ out.unique = 0; ++ out.error = FUSE_NOTIFY_STORE; ++ ++ outarg.nodeid = ino; ++ outarg.offset = offset; ++ outarg.size = size; ++ outarg.padding = 0; ++ ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(out); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ ++ res = fuse_send_data_iov(se, NULL, iov, 2, bufv, flags); ++ if (res > 0) ++ res = -res; ++ ++ return res; ++} ++ ++struct fuse_retrieve_req { ++ struct fuse_notify_req nreq; ++ void *cookie; ++}; ++ ++static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, ++ fuse_req_t req, fuse_ino_t ino, ++ const void *inarg, ++ const struct fuse_buf *ibuf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_retrieve_req *rreq = ++ container_of(nreq, struct 
fuse_retrieve_req, nreq); ++ const struct fuse_notify_retrieve_in *arg = inarg; ++ struct fuse_bufvec bufv = { ++ .buf[0] = *ibuf, ++ .count = 1, ++ }; ++ ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) ++ bufv.buf[0].mem = PARAM(arg); ++ ++ bufv.buf[0].size -= sizeof(struct fuse_in_header) + ++ sizeof(struct fuse_notify_retrieve_in); ++ ++ if (bufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: retrieve reply: buffer size too small\n"); ++ fuse_reply_none(req); ++ goto out; ++ } ++ bufv.buf[0].size = arg->size; ++ ++ if (se->op.retrieve_reply) { ++ se->op.retrieve_reply(req, rreq->cookie, ino, ++ arg->offset, &bufv); ++ } else { ++ fuse_reply_none(req); ++ } ++out: ++ free(rreq); ++ if ((ibuf->flags & FUSE_BUF_IS_FD) && bufv.idx < bufv.count) ++ fuse_ll_clear_pipe(se); ++} ++ ++int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, ++ size_t size, off_t offset, void *cookie) ++{ ++ struct fuse_notify_retrieve_out outarg; ++ struct iovec iov[2]; ++ struct fuse_retrieve_req *rreq; ++ int err; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) ++ return -ENOSYS; ++ ++ rreq = malloc(sizeof(*rreq)); ++ if (rreq == NULL) ++ return -ENOMEM; ++ ++ pthread_mutex_lock(&se->lock); ++ rreq->cookie = cookie; ++ rreq->nreq.unique = se->notify_ctr++; ++ rreq->nreq.reply = fuse_ll_retrieve_reply; ++ list_add_nreq(&rreq->nreq, &se->notify_list); ++ pthread_mutex_unlock(&se->lock); ++ ++ outarg.notify_unique = rreq->nreq.unique; ++ outarg.nodeid = ino; ++ outarg.offset = offset; ++ outarg.size = size; ++ outarg.padding = 0; ++ ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ ++ err = send_notify_iov(se, FUSE_NOTIFY_RETRIEVE, iov, 2); ++ if (err) { ++ pthread_mutex_lock(&se->lock); ++ list_del_nreq(&rreq->nreq); ++ pthread_mutex_unlock(&se->lock); ++ free(rreq); ++ } ++ ++ return err; ++} ++ ++void *fuse_req_userdata(fuse_req_t req) ++{ ++ return req->se->userdata; ++} ++ ++const 
struct fuse_ctx *fuse_req_ctx(fuse_req_t req) ++{ ++ return &req->ctx; ++} ++ ++void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, ++ void *data) ++{ ++ pthread_mutex_lock(&req->lock); ++ pthread_mutex_lock(&req->se->lock); ++ req->u.ni.func = func; ++ req->u.ni.data = data; ++ pthread_mutex_unlock(&req->se->lock); ++ if (req->interrupted && func) ++ func(req, data); ++ pthread_mutex_unlock(&req->lock); ++} ++ ++int fuse_req_interrupted(fuse_req_t req) ++{ ++ int interrupted; ++ ++ pthread_mutex_lock(&req->se->lock); ++ interrupted = req->interrupted; ++ pthread_mutex_unlock(&req->se->lock); ++ ++ return interrupted; ++} ++ ++static struct { ++ void (*func)(fuse_req_t, fuse_ino_t, const void *); ++ const char *name; ++} fuse_ll_ops[] = { ++ [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, ++ [FUSE_FORGET] = { do_forget, "FORGET" }, ++ [FUSE_GETATTR] = { do_getattr, "GETATTR" }, ++ [FUSE_SETATTR] = { do_setattr, "SETATTR" }, ++ [FUSE_READLINK] = { do_readlink, "READLINK" }, ++ [FUSE_SYMLINK] = { do_symlink, "SYMLINK" }, ++ [FUSE_MKNOD] = { do_mknod, "MKNOD" }, ++ [FUSE_MKDIR] = { do_mkdir, "MKDIR" }, ++ [FUSE_UNLINK] = { do_unlink, "UNLINK" }, ++ [FUSE_RMDIR] = { do_rmdir, "RMDIR" }, ++ [FUSE_RENAME] = { do_rename, "RENAME" }, ++ [FUSE_LINK] = { do_link, "LINK" }, ++ [FUSE_OPEN] = { do_open, "OPEN" }, ++ [FUSE_READ] = { do_read, "READ" }, ++ [FUSE_WRITE] = { do_write, "WRITE" }, ++ [FUSE_STATFS] = { do_statfs, "STATFS" }, ++ [FUSE_RELEASE] = { do_release, "RELEASE" }, ++ [FUSE_FSYNC] = { do_fsync, "FSYNC" }, ++ [FUSE_SETXATTR] = { do_setxattr, "SETXATTR" }, ++ [FUSE_GETXATTR] = { do_getxattr, "GETXATTR" }, ++ [FUSE_LISTXATTR] = { do_listxattr, "LISTXATTR" }, ++ [FUSE_REMOVEXATTR] = { do_removexattr, "REMOVEXATTR" }, ++ [FUSE_FLUSH] = { do_flush, "FLUSH" }, ++ [FUSE_INIT] = { do_init, "INIT" }, ++ [FUSE_OPENDIR] = { do_opendir, "OPENDIR" }, ++ [FUSE_READDIR] = { do_readdir, "READDIR" }, ++ [FUSE_RELEASEDIR] = { do_releasedir, "RELEASEDIR" }, ++ 
[FUSE_FSYNCDIR] = { do_fsyncdir, "FSYNCDIR" }, ++ [FUSE_GETLK] = { do_getlk, "GETLK" }, ++ [FUSE_SETLK] = { do_setlk, "SETLK" }, ++ [FUSE_SETLKW] = { do_setlkw, "SETLKW" }, ++ [FUSE_ACCESS] = { do_access, "ACCESS" }, ++ [FUSE_CREATE] = { do_create, "CREATE" }, ++ [FUSE_INTERRUPT] = { do_interrupt, "INTERRUPT" }, ++ [FUSE_BMAP] = { do_bmap, "BMAP" }, ++ [FUSE_IOCTL] = { do_ioctl, "IOCTL" }, ++ [FUSE_POLL] = { do_poll, "POLL" }, ++ [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, ++ [FUSE_DESTROY] = { do_destroy, "DESTROY" }, ++ [FUSE_NOTIFY_REPLY] = { (void *) 1, "NOTIFY_REPLY" }, ++ [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, ++ [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS"}, ++ [FUSE_RENAME2] = { do_rename2, "RENAME2" }, ++ [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, ++ [FUSE_LSEEK] = { do_lseek, "LSEEK" }, ++ [CUSE_INIT] = { cuse_lowlevel_init, "CUSE_INIT" }, ++}; ++ ++#define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0])) ++ ++static const char *opname(enum fuse_opcode opcode) ++{ ++ if (opcode >= FUSE_MAXOP || !fuse_ll_ops[opcode].name) ++ return "???"; ++ else ++ return fuse_ll_ops[opcode].name; ++} ++ ++static int fuse_ll_copy_from_pipe(struct fuse_bufvec *dst, ++ struct fuse_bufvec *src) ++{ ++ ssize_t res = fuse_buf_copy(dst, src, 0); ++ if (res < 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: %s\n", strerror(-res)); ++ return res; ++ } ++ if ((size_t)res < fuse_buf_size(dst)) { ++ fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: short read\n"); ++ return -1; ++ } ++ return 0; ++} ++ ++void fuse_session_process_buf(struct fuse_session *se, ++ const struct fuse_buf *buf) ++{ ++ fuse_session_process_buf_int(se, buf, NULL); ++} ++ ++void fuse_session_process_buf_int(struct fuse_session *se, ++ const struct fuse_buf *buf, struct fuse_chan *ch) ++{ ++ const size_t write_header_size = sizeof(struct fuse_in_header) + ++ sizeof(struct fuse_write_in); ++ struct fuse_bufvec bufv = { .buf[0] = *buf, .count 
= 1 }; ++ struct fuse_bufvec tmpbuf = FUSE_BUFVEC_INIT(write_header_size); ++ struct fuse_in_header *in; ++ const void *inarg; ++ struct fuse_req *req; ++ void *mbuf = NULL; ++ int err; ++ int res; ++ ++ if (buf->flags & FUSE_BUF_IS_FD) { ++ if (buf->size < tmpbuf.buf[0].size) ++ tmpbuf.buf[0].size = buf->size; ++ ++ mbuf = malloc(tmpbuf.buf[0].size); ++ if (mbuf == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate header\n"); ++ goto clear_pipe; ++ } ++ tmpbuf.buf[0].mem = mbuf; ++ ++ res = fuse_ll_copy_from_pipe(&tmpbuf, &bufv); ++ if (res < 0) ++ goto clear_pipe; ++ ++ in = mbuf; ++ } else { ++ in = buf->mem; ++ } ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n", ++ (unsigned long long) in->unique, ++ opname((enum fuse_opcode) in->opcode), in->opcode, ++ (unsigned long long) in->nodeid, buf->size, in->pid); ++ } ++ ++ req = fuse_ll_alloc_req(se); ++ if (req == NULL) { ++ struct fuse_out_header out = { ++ .unique = in->unique, ++ .error = -ENOMEM, ++ }; ++ struct iovec iov = { ++ .iov_base = &out, ++ .iov_len = sizeof(struct fuse_out_header), ++ }; ++ ++ fuse_send_msg(se, ch, &iov, 1); ++ goto clear_pipe; ++ } ++ ++ req->unique = in->unique; ++ req->ctx.uid = in->uid; ++ req->ctx.gid = in->gid; ++ req->ctx.pid = in->pid; ++ req->ch = ch ? fuse_chan_get(ch) : NULL; ++ ++ err = EIO; ++ if (!se->got_init) { ++ enum fuse_opcode expected; ++ ++ expected = se->cuse_data ? 
CUSE_INIT : FUSE_INIT; ++ if (in->opcode != expected) ++ goto reply_err; ++ } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) ++ goto reply_err; ++ ++ err = EACCES; ++ /* Implement -o allow_root */ ++ if (se->deny_others && in->uid != se->owner && in->uid != 0 && ++ in->opcode != FUSE_INIT && in->opcode != FUSE_READ && ++ in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC && ++ in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR && ++ in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR && ++ in->opcode != FUSE_NOTIFY_REPLY && ++ in->opcode != FUSE_READDIRPLUS) ++ goto reply_err; ++ ++ err = ENOSYS; ++ if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) ++ goto reply_err; ++ if (in->opcode != FUSE_INTERRUPT) { ++ struct fuse_req *intr; ++ pthread_mutex_lock(&se->lock); ++ intr = check_interrupt(se, req); ++ list_add_req(req, &se->list); ++ pthread_mutex_unlock(&se->lock); ++ if (intr) ++ fuse_reply_err(intr, EAGAIN); ++ } ++ ++ if ((buf->flags & FUSE_BUF_IS_FD) && write_header_size < buf->size && ++ (in->opcode != FUSE_WRITE || !se->op.write_buf) && ++ in->opcode != FUSE_NOTIFY_REPLY) { ++ void *newmbuf; ++ ++ err = ENOMEM; ++ newmbuf = realloc(mbuf, buf->size); ++ if (newmbuf == NULL) ++ goto reply_err; ++ mbuf = newmbuf; ++ ++ tmpbuf = FUSE_BUFVEC_INIT(buf->size - write_header_size); ++ tmpbuf.buf[0].mem = (char *)mbuf + write_header_size; ++ ++ res = fuse_ll_copy_from_pipe(&tmpbuf, &bufv); ++ err = -res; ++ if (res < 0) ++ goto reply_err; ++ ++ in = mbuf; ++ } ++ ++ inarg = (void *) &in[1]; ++ if (in->opcode == FUSE_WRITE && se->op.write_buf) ++ do_write_buf(req, in->nodeid, inarg, buf); ++ else if (in->opcode == FUSE_NOTIFY_REPLY) ++ do_notify_reply(req, in->nodeid, inarg, buf); ++ else ++ fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); ++ ++out_free: ++ free(mbuf); ++ return; ++ ++reply_err: ++ fuse_reply_err(req, err); ++clear_pipe: ++ if (buf->flags & FUSE_BUF_IS_FD) ++ fuse_ll_clear_pipe(se); ++ goto out_free; 
++} ++ ++#define LL_OPTION(n,o,v) \ ++ { n, offsetof(struct fuse_session, o), v } ++ ++static const struct fuse_opt fuse_ll_opts[] = { ++ LL_OPTION("debug", debug, 1), ++ LL_OPTION("-d", debug, 1), ++ LL_OPTION("--debug", debug, 1), ++ LL_OPTION("allow_root", deny_others, 1), ++ FUSE_OPT_END ++}; ++ ++void fuse_lowlevel_version(void) ++{ ++ printf("using FUSE kernel interface version %i.%i\n", ++ FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); ++ fuse_mount_version(); ++} ++ ++void fuse_lowlevel_help(void) ++{ ++ /* These are not all options, but the ones that are ++ potentially of interest to an end-user */ ++ printf( ++" -o allow_other allow access by all users\n" ++" -o allow_root allow access by root\n" ++" -o auto_unmount auto unmount on process termination\n"); ++} ++ ++void fuse_session_destroy(struct fuse_session *se) ++{ ++ struct fuse_ll_pipe *llp; ++ ++ if (se->got_init && !se->got_destroy) { ++ if (se->op.destroy) ++ se->op.destroy(se->userdata); ++ } ++ llp = pthread_getspecific(se->pipe_key); ++ if (llp != NULL) ++ fuse_ll_pipe_free(llp); ++ pthread_key_delete(se->pipe_key); ++ pthread_mutex_destroy(&se->lock); ++ free(se->cuse_data); ++ if (se->fd != -1) ++ close(se->fd); ++ destroy_mount_opts(se->mo); ++ free(se); ++} ++ ++ ++static void fuse_ll_pipe_destructor(void *data) ++{ ++ struct fuse_ll_pipe *llp = data; ++ fuse_ll_pipe_free(llp); ++} ++ ++int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf) ++{ ++ return fuse_session_receive_buf_int(se, buf, NULL); ++} ++ ++int fuse_session_receive_buf_int(struct fuse_session *se, struct fuse_buf *buf, ++ struct fuse_chan *ch) ++{ ++ int err; ++ ssize_t res; ++#ifdef HAVE_SPLICE ++ size_t bufsize = se->bufsize; ++ struct fuse_ll_pipe *llp; ++ struct fuse_buf tmpbuf; ++ ++ if (se->conn.proto_minor < 14 || !(se->conn.want & FUSE_CAP_SPLICE_READ)) ++ goto fallback; ++ ++ llp = fuse_ll_get_pipe(se); ++ if (llp == NULL) ++ goto fallback; ++ ++ if (llp->size < bufsize) { ++ if 
(llp->can_grow) { ++ res = fcntl(llp->pipe[0], F_SETPIPE_SZ, bufsize); ++ if (res == -1) { ++ llp->can_grow = 0; ++ res = grow_pipe_to_max(llp->pipe[0]); ++ if (res > 0) ++ llp->size = res; ++ goto fallback; ++ } ++ llp->size = res; ++ } ++ if (llp->size < bufsize) ++ goto fallback; ++ } ++ ++ res = splice(ch ? ch->fd : se->fd, ++ NULL, llp->pipe[1], NULL, bufsize, 0); ++ err = errno; ++ ++ if (fuse_session_exited(se)) ++ return 0; ++ ++ if (res == -1) { ++ if (err == ENODEV) { ++ /* Filesystem was unmounted, or connection was aborted ++ via /sys/fs/fuse/connections */ ++ fuse_session_exit(se); ++ return 0; ++ } ++ if (err != EINTR && err != EAGAIN) ++ perror("fuse: splice from device"); ++ return -err; ++ } ++ ++ if (res < sizeof(struct fuse_in_header)) { ++ fuse_log(FUSE_LOG_ERR, "short splice from fuse device\n"); ++ return -EIO; ++ } ++ ++ tmpbuf = (struct fuse_buf) { ++ .size = res, ++ .flags = FUSE_BUF_IS_FD, ++ .fd = llp->pipe[0], ++ }; ++ ++ /* ++ * Don't bother with zero copy for small requests. ++ * fuse_loop_mt() needs to check for FORGET so this more than ++ * just an optimization. 
++ */ ++ if (res < sizeof(struct fuse_in_header) + ++ sizeof(struct fuse_write_in) + pagesize) { ++ struct fuse_bufvec src = { .buf[0] = tmpbuf, .count = 1 }; ++ struct fuse_bufvec dst = { .count = 1 }; ++ ++ if (!buf->mem) { ++ buf->mem = malloc(se->bufsize); ++ if (!buf->mem) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: failed to allocate read buffer\n"); ++ return -ENOMEM; ++ } ++ } ++ buf->size = se->bufsize; ++ buf->flags = 0; ++ dst.buf[0] = *buf; ++ ++ res = fuse_buf_copy(&dst, &src, 0); ++ if (res < 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: %s\n", ++ strerror(-res)); ++ fuse_ll_clear_pipe(se); ++ return res; ++ } ++ if (res < tmpbuf.size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: short read\n"); ++ fuse_ll_clear_pipe(se); ++ return -EIO; ++ } ++ assert(res == tmpbuf.size); ++ ++ } else { ++ /* Don't overwrite buf->mem, as that would cause a leak */ ++ buf->fd = tmpbuf.fd; ++ buf->flags = tmpbuf.flags; ++ } ++ buf->size = tmpbuf.size; ++ ++ return res; ++ ++fallback: ++#endif ++ if (!buf->mem) { ++ buf->mem = malloc(se->bufsize); ++ if (!buf->mem) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: failed to allocate read buffer\n"); ++ return -ENOMEM; ++ } ++ } ++ ++restart: ++ res = read(ch ? 
ch->fd : se->fd, buf->mem, se->bufsize); ++ err = errno; ++ ++ if (fuse_session_exited(se)) ++ return 0; ++ if (res == -1) { ++ /* ENOENT means the operation was interrupted, it's safe ++ to restart */ ++ if (err == ENOENT) ++ goto restart; ++ ++ if (err == ENODEV) { ++ /* Filesystem was unmounted, or connection was aborted ++ via /sys/fs/fuse/connections */ ++ fuse_session_exit(se); ++ return 0; ++ } ++ /* Errors occurring during normal operation: EINTR (read ++ interrupted), EAGAIN (nonblocking I/O), ENODEV (filesystem ++ umounted) */ ++ if (err != EINTR && err != EAGAIN) ++ perror("fuse: reading device"); ++ return -err; ++ } ++ if ((size_t) res < sizeof(struct fuse_in_header)) { ++ fuse_log(FUSE_LOG_ERR, "short read on fuse device\n"); ++ return -EIO; ++ } ++ ++ buf->size = res; ++ ++ return res; ++} ++ ++struct fuse_session *fuse_session_new(struct fuse_args *args, ++ const struct fuse_lowlevel_ops *op, ++ size_t op_size, void *userdata) ++{ ++ int err; ++ struct fuse_session *se; ++ struct mount_opts *mo; ++ ++ if (sizeof(struct fuse_lowlevel_ops) < op_size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: warning: library too old, some operations may not work\n"); ++ op_size = sizeof(struct fuse_lowlevel_ops); ++ } ++ ++ if (args->argc == 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: empty argv passed to fuse_session_new().\n"); ++ return NULL; ++ } ++ ++ se = (struct fuse_session *) calloc(1, sizeof(struct fuse_session)); ++ if (se == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate fuse object\n"); ++ goto out1; ++ } ++ se->fd = -1; ++ se->conn.max_write = UINT_MAX; ++ se->conn.max_readahead = UINT_MAX; ++ ++ /* Parse options */ ++ if(fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) ++ goto out2; ++ if(se->deny_others) { ++ /* Allowing access only by root is done by instructing ++ * kernel to allow access by everyone, and then restricting ++ * access to root and mountpoint owner in libfuse. 
++ */ ++ // We may be adding the option a second time, but ++ // that doesn't hurt. ++ if(fuse_opt_add_arg(args, "-oallow_other") == -1) ++ goto out2; ++ } ++ mo = parse_mount_opts(args); ++ if (mo == NULL) ++ goto out3; ++ ++ if(args->argc == 1 && ++ args->argv[0][0] == '-') { ++ fuse_log(FUSE_LOG_ERR, "fuse: warning: argv[0] looks like an option, but " ++ "will be ignored\n"); ++ } else if (args->argc != 1) { ++ int i; ++ fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `"); ++ for(i = 1; i < args->argc-1; i++) ++ fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]); ++ fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]); ++ goto out4; ++ } ++ ++ if (se->debug) ++ fuse_log(FUSE_LOG_DEBUG, "FUSE library version: %s\n", PACKAGE_VERSION); ++ ++ se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + ++ FUSE_BUFFER_HEADER_SIZE; ++ ++ list_init_req(&se->list); ++ list_init_req(&se->interrupts); ++ list_init_nreq(&se->notify_list); ++ se->notify_ctr = 1; ++ fuse_mutex_init(&se->lock); ++ ++ err = pthread_key_create(&se->pipe_key, fuse_ll_pipe_destructor); ++ if (err) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to create thread specific key: %s\n", ++ strerror(err)); ++ goto out5; ++ } ++ ++ memcpy(&se->op, op, op_size); ++ se->owner = getuid(); ++ se->userdata = userdata; ++ ++ se->mo = mo; ++ return se; ++ ++out5: ++ pthread_mutex_destroy(&se->lock); ++out4: ++ fuse_opt_free_args(args); ++out3: ++ free(mo); ++out2: ++ free(se); ++out1: ++ return NULL; ++} ++ ++int fuse_session_mount(struct fuse_session *se, const char *mountpoint) ++{ ++ int fd; ++ ++ /* ++ * Make sure file descriptors 0, 1 and 2 are open, otherwise chaos ++ * would ensue. ++ */ ++ do { ++ fd = open("/dev/null", O_RDWR); ++ if (fd > 2) ++ close(fd); ++ } while (fd >= 0 && fd <= 2); ++ ++ /* ++ * To allow FUSE daemons to run without privileges, the caller may open ++ * /dev/fuse before launching the file system and pass on the file ++ * descriptor by specifying /dev/fd/N as the mount point. 
Note that the ++ * parent process takes care of performing the mount in this case. ++ */ ++ fd = fuse_mnt_parse_fuse_fd(mountpoint); ++ if (fd != -1) { ++ if (fcntl(fd, F_GETFD) == -1) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: Invalid file descriptor /dev/fd/%u\n", ++ fd); ++ return -1; ++ } ++ se->fd = fd; ++ return 0; ++ } ++ ++ /* Open channel */ ++ fd = fuse_kern_mount(mountpoint, se->mo); ++ if (fd == -1) ++ return -1; ++ se->fd = fd; ++ ++ /* Save mountpoint */ ++ se->mountpoint = strdup(mountpoint); ++ if (se->mountpoint == NULL) ++ goto error_out; ++ ++ return 0; ++ ++error_out: ++ fuse_kern_unmount(mountpoint, fd); ++ return -1; ++} ++ ++int fuse_session_fd(struct fuse_session *se) ++{ ++ return se->fd; ++} ++ ++void fuse_session_unmount(struct fuse_session *se) ++{ ++ if (se->mountpoint != NULL) { ++ fuse_kern_unmount(se->mountpoint, se->fd); ++ free(se->mountpoint); ++ se->mountpoint = NULL; ++ } ++} ++ ++#ifdef linux ++int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) ++{ ++ char *buf; ++ size_t bufsize = 1024; ++ char path[128]; ++ int ret; ++ int fd; ++ unsigned long pid = req->ctx.pid; ++ char *s; ++ ++ sprintf(path, "/proc/%lu/task/%lu/status", pid, pid); ++ ++retry: ++ buf = malloc(bufsize); ++ if (buf == NULL) ++ return -ENOMEM; ++ ++ ret = -EIO; ++ fd = open(path, O_RDONLY); ++ if (fd == -1) ++ goto out_free; ++ ++ ret = read(fd, buf, bufsize); ++ close(fd); ++ if (ret < 0) { ++ ret = -EIO; ++ goto out_free; ++ } ++ ++ if ((size_t)ret == bufsize) { ++ free(buf); ++ bufsize *= 4; ++ goto retry; ++ } ++ ++ ret = -EIO; ++ s = strstr(buf, "\nGroups:"); ++ if (s == NULL) ++ goto out_free; ++ ++ s += 8; ++ ret = 0; ++ while (1) { ++ char *end; ++ unsigned long val = strtoul(s, &end, 0); ++ if (end == s) ++ break; ++ ++ s = end; ++ if (ret < size) ++ list[ret] = val; ++ ret++; ++ } ++ ++out_free: ++ free(buf); ++ return ret; ++} ++#else /* linux */ ++/* ++ * This is currently not implemented on other than Linux... 
++ */ ++int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) ++{ ++ (void) req; (void) size; (void) list; ++ return -ENOSYS; ++} ++#endif ++ ++void fuse_session_exit(struct fuse_session *se) ++{ ++ se->exited = 1; ++} ++ ++void fuse_session_reset(struct fuse_session *se) ++{ ++ se->exited = 0; ++ se->error = 0; ++} ++ ++int fuse_session_exited(struct fuse_session *se) ++{ ++ return se->exited; ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-main-virtio-loop.patch b/kvm-virtiofsd-Add-main-virtio-loop.patch new file mode 100755 index 0000000..c0ba96a --- /dev/null +++ b/kvm-virtiofsd-Add-main-virtio-loop.patch @@ -0,0 +1,105 @@ +From 6f413d8b76ff38e5bc01f36515ca71d7fd6e6144 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:58 +0100 +Subject: [PATCH 027/116] virtiofsd: Add main virtio loop +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-24-dgilbert@redhat.com> +Patchwork-id: 93475 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 023/112] virtiofsd: Add main virtio loop +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Processes incoming requests on the vhost-user fd. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 204d8ae57b3c57098642c79b3c03d42495149c09) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 42 +++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 39 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 2ae3c76..1928a20 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -11,12 +11,14 @@ + * See the file COPYING.LIB + */ + ++#include "fuse_virtio.h" + #include "fuse_i.h" + #include "standard-headers/linux/fuse.h" + #include "fuse_misc.h" + #include "fuse_opt.h" +-#include "fuse_virtio.h" + ++#include ++#include + #include + #include + #include +@@ -80,15 +82,49 @@ static const VuDevIface fv_iface = { + .queue_is_processed_in_order = fv_queue_order, + }; + ++/* ++ * Main loop; this mostly deals with events on the vhost-user ++ * socket itself, and not actual fuse data. ++ */ + int virtio_loop(struct fuse_session *se) + { + fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__); + +- while (1) { +- /* TODO: Add stuffing */ ++ while (!fuse_session_exited(se)) { ++ struct pollfd pf[1]; ++ pf[0].fd = se->vu_socketfd; ++ pf[0].events = POLLIN; ++ pf[0].revents = 0; ++ ++ fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__); ++ int poll_res = ppoll(pf, 1, NULL, NULL); ++ ++ if (poll_res == -1) { ++ if (errno == EINTR) { ++ fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n", ++ __func__); ++ continue; ++ } ++ fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n"); ++ break; ++ } ++ assert(poll_res == 1); ++ if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { ++ fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__, ++ pf[0].revents); ++ break; ++ } ++ assert(pf[0].revents & POLLIN); ++ fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__); ++ if (!vu_dispatch(&se->virtio_dev->dev)) { ++ fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__); ++ break; ++ } + } + + 
fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__); ++ ++ return 0; + } + + int virtio_session_mount(struct fuse_session *se) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-options-for-virtio.patch b/kvm-virtiofsd-Add-options-for-virtio.patch new file mode 100755 index 0000000..8ac7fa7 --- /dev/null +++ b/kvm-virtiofsd-Add-options-for-virtio.patch @@ -0,0 +1,103 @@ +From 9c1bbe327cf8f88ffc78eed0fce8cdd6f3f006ef Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:54 +0100 +Subject: [PATCH 023/116] virtiofsd: Add options for virtio +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-20-dgilbert@redhat.com> +Patchwork-id: 93473 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 019/112] virtiofsd: Add options for virtio +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Add options to specify parameters for virtio-fs paths, i.e. + + ./virtiofsd -o vhost_user_socket=/tmp/vhostqemu + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 205de006aab8dcbe546a7e3a51d295c2d05e654b) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 1 + + tools/virtiofsd/fuse_lowlevel.c | 11 ++++++++--- + tools/virtiofsd/helper.c | 14 +++++++------- + 3 files changed, 16 insertions(+), 10 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index bae0699..26b1a7d 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -63,6 +63,7 @@ struct fuse_session { + struct fuse_notify_req notify_list; + size_t bufsize; + int error; ++ char *vu_socket_path; + }; + + struct fuse_chan { +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 8552cfb..17e8718 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2115,8 +2115,11 @@ reply_err: + } + + static const struct fuse_opt fuse_ll_opts[] = { +- LL_OPTION("debug", debug, 1), LL_OPTION("-d", debug, 1), +- LL_OPTION("--debug", debug, 1), LL_OPTION("allow_root", deny_others, 1), ++ LL_OPTION("debug", debug, 1), ++ LL_OPTION("-d", debug, 1), ++ LL_OPTION("--debug", debug, 1), ++ LL_OPTION("allow_root", deny_others, 1), ++ LL_OPTION("--socket-path=%s", vu_socket_path, 0), + FUSE_OPT_END + }; + +@@ -2132,7 +2135,9 @@ void fuse_lowlevel_help(void) + * These are not all options, but the ones that are + * potentially of interest to an end-user + */ +- printf(" -o allow_root allow access by root\n"); ++ printf( ++ " -o allow_root allow access by root\n" ++ " --socket-path=PATH path for the vhost-user socket\n"); + } + + void fuse_session_destroy(struct fuse_session *se) +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 9333691..676032e 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -127,13 +127,13 @@ static const struct fuse_opt conn_info_opt_spec[] = { + + void fuse_cmdline_help(void) + { +- printf( +- " -h --help print help\n" +- " -V --version print version\n" 
+- " -d -o debug enable debug output (implies -f)\n" +- " -f foreground operation\n" +- " -o max_idle_threads the maximum number of idle worker threads\n" +- " allowed (default: 10)\n"); ++ printf(" -h --help print help\n" ++ " -V --version print version\n" ++ " -d -o debug enable debug output (implies -f)\n" ++ " -f foreground operation\n" ++ " -o max_idle_threads the maximum number of idle worker " ++ "threads\n" ++ " allowed (default: 10)\n"); + } + + static int fuse_helper_opt_proc(void *data, const char *arg, int key, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-passthrough_ll.patch b/kvm-virtiofsd-Add-passthrough_ll.patch new file mode 100755 index 0000000..2510551 --- /dev/null +++ b/kvm-virtiofsd-Add-passthrough_ll.patch @@ -0,0 +1,1387 @@ +From 18ef831cac81a6bd2336c73dda357d9d69f8fd25 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:43 +0100 +Subject: [PATCH 012/116] virtiofsd: Add passthrough_ll +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-9-dgilbert@redhat.com> +Patchwork-id: 93462 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 008/112] virtiofsd: Add passthrough_ll +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +passthrough_ll is one of the examples in the upstream fuse project +and is the main part of our daemon here. It passes through requests +from fuse to the underlying filesystem, using syscalls as directly +as possible. + +>From libfuse fuse-3.8.0 + +Signed-off-by: Dr. David Alan Gilbert + Fixed up 'GPL' to 'GPLv2' as per Dan's comments and consistent + with the 'LICENSE' file in libfuse; patch sent to libfuse to fix + it upstream. +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 7c6b66027241f41720240fc6ee1021cdbd975b2e) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 1338 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 1338 insertions(+) + create mode 100644 tools/virtiofsd/passthrough_ll.c + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +new file mode 100644 +index 0000000..e1a6056 +--- /dev/null ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -0,0 +1,1338 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU GPLv2. ++ See the file COPYING. ++*/ ++ ++/** @file ++ * ++ * This file system mirrors the existing file system hierarchy of the ++ * system, starting at the root file system. This is implemented by ++ * just "passing through" all requests to the corresponding user-space ++ * libc functions. In contrast to passthrough.c and passthrough_fh.c, ++ * this implementation uses the low-level API. Its performance should ++ * be the least bad among the three, but many operations are not ++ * implemented. In particular, it is not possible to remove files (or ++ * directories) because the code necessary to defer actual removal ++ * until the file is not opened anymore would make the example much ++ * more complicated. ++ * ++ * When writeback caching is enabled (-o writeback mount option), it ++ * is only possible to write to files for which the mounting user has ++ * read permissions. This is because the writeback cache requires the ++ * kernel to be able to issue read requests for all files (which the ++ * passthrough filesystem cannot satisfy if it can't read the file in ++ * the underlying filesystem). 
++ * ++ * Compile with: ++ * ++ * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o passthrough_ll ++ * ++ * ## Source code ## ++ * \include passthrough_ll.c ++ */ ++ ++#define _GNU_SOURCE ++#define FUSE_USE_VERSION 31 ++ ++#include "config.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "passthrough_helpers.h" ++ ++/* We are re-using pointers to our `struct lo_inode` and `struct ++ lo_dirp` elements as inodes. This means that we must be able to ++ store uintptr_t values in a fuse_ino_t variable. The following ++ incantation checks this condition at compile time. */ ++#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && !defined __cplusplus ++_Static_assert(sizeof(fuse_ino_t) >= sizeof(uintptr_t), ++ "fuse_ino_t too small to hold uintptr_t values!"); ++#else ++struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct \ ++ { unsigned _uintptr_to_must_hold_fuse_ino_t: ++ ((sizeof(fuse_ino_t) >= sizeof(uintptr_t)) ? 
1 : -1); }; ++#endif ++ ++struct lo_inode { ++ struct lo_inode *next; /* protected by lo->mutex */ ++ struct lo_inode *prev; /* protected by lo->mutex */ ++ int fd; ++ bool is_symlink; ++ ino_t ino; ++ dev_t dev; ++ uint64_t refcount; /* protected by lo->mutex */ ++}; ++ ++enum { ++ CACHE_NEVER, ++ CACHE_NORMAL, ++ CACHE_ALWAYS, ++}; ++ ++struct lo_data { ++ pthread_mutex_t mutex; ++ int debug; ++ int writeback; ++ int flock; ++ int xattr; ++ const char *source; ++ double timeout; ++ int cache; ++ int timeout_set; ++ struct lo_inode root; /* protected by lo->mutex */ ++}; ++ ++static const struct fuse_opt lo_opts[] = { ++ { "writeback", ++ offsetof(struct lo_data, writeback), 1 }, ++ { "no_writeback", ++ offsetof(struct lo_data, writeback), 0 }, ++ { "source=%s", ++ offsetof(struct lo_data, source), 0 }, ++ { "flock", ++ offsetof(struct lo_data, flock), 1 }, ++ { "no_flock", ++ offsetof(struct lo_data, flock), 0 }, ++ { "xattr", ++ offsetof(struct lo_data, xattr), 1 }, ++ { "no_xattr", ++ offsetof(struct lo_data, xattr), 0 }, ++ { "timeout=%lf", ++ offsetof(struct lo_data, timeout), 0 }, ++ { "timeout=", ++ offsetof(struct lo_data, timeout_set), 1 }, ++ { "cache=never", ++ offsetof(struct lo_data, cache), CACHE_NEVER }, ++ { "cache=auto", ++ offsetof(struct lo_data, cache), CACHE_NORMAL }, ++ { "cache=always", ++ offsetof(struct lo_data, cache), CACHE_ALWAYS }, ++ ++ FUSE_OPT_END ++}; ++ ++static struct lo_data *lo_data(fuse_req_t req) ++{ ++ return (struct lo_data *) fuse_req_userdata(req); ++} ++ ++static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) ++{ ++ if (ino == FUSE_ROOT_ID) ++ return &lo_data(req)->root; ++ else ++ return (struct lo_inode *) (uintptr_t) ino; ++} ++ ++static int lo_fd(fuse_req_t req, fuse_ino_t ino) ++{ ++ return lo_inode(req, ino)->fd; ++} ++ ++static bool lo_debug(fuse_req_t req) ++{ ++ return lo_data(req)->debug != 0; ++} ++ ++static void lo_init(void *userdata, ++ struct fuse_conn_info *conn) ++{ ++ struct lo_data *lo = 
(struct lo_data*) userdata; ++ ++ if(conn->capable & FUSE_CAP_EXPORT_SUPPORT) ++ conn->want |= FUSE_CAP_EXPORT_SUPPORT; ++ ++ if (lo->writeback && ++ conn->capable & FUSE_CAP_WRITEBACK_CACHE) { ++ if (lo->debug) ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); ++ conn->want |= FUSE_CAP_WRITEBACK_CACHE; ++ } ++ if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { ++ if (lo->debug) ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); ++ conn->want |= FUSE_CAP_FLOCK_LOCKS; ++ } ++} ++ ++static void lo_getattr(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi) ++{ ++ int res; ++ struct stat buf; ++ struct lo_data *lo = lo_data(req); ++ ++ (void) fi; ++ ++ res = fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) ++ return (void) fuse_reply_err(req, errno); ++ ++ fuse_reply_attr(req, &buf, lo->timeout); ++} ++ ++static int utimensat_empty_nofollow(struct lo_inode *inode, ++ const struct timespec *tv) ++{ ++ int res; ++ char procname[64]; ++ ++ if (inode->is_symlink) { ++ res = utimensat(inode->fd, "", tv, ++ AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1 && errno == EINVAL) { ++ /* Sorry, no race free way to set times on symlink. */ ++ errno = EPERM; ++ } ++ return res; ++ } ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ return utimensat(AT_FDCWD, procname, tv, 0); ++} ++ ++static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, ++ int valid, struct fuse_file_info *fi) ++{ ++ int saverr; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ int ifd = inode->fd; ++ int res; ++ ++ if (valid & FUSE_SET_ATTR_MODE) { ++ if (fi) { ++ res = fchmod(fi->fh, attr->st_mode); ++ } else { ++ sprintf(procname, "/proc/self/fd/%i", ifd); ++ res = chmod(procname, attr->st_mode); ++ } ++ if (res == -1) ++ goto out_err; ++ } ++ if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { ++ uid_t uid = (valid & FUSE_SET_ATTR_UID) ? 
++ attr->st_uid : (uid_t) -1; ++ gid_t gid = (valid & FUSE_SET_ATTR_GID) ? ++ attr->st_gid : (gid_t) -1; ++ ++ res = fchownat(ifd, "", uid, gid, ++ AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) ++ goto out_err; ++ } ++ if (valid & FUSE_SET_ATTR_SIZE) { ++ if (fi) { ++ res = ftruncate(fi->fh, attr->st_size); ++ } else { ++ sprintf(procname, "/proc/self/fd/%i", ifd); ++ res = truncate(procname, attr->st_size); ++ } ++ if (res == -1) ++ goto out_err; ++ } ++ if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { ++ struct timespec tv[2]; ++ ++ tv[0].tv_sec = 0; ++ tv[1].tv_sec = 0; ++ tv[0].tv_nsec = UTIME_OMIT; ++ tv[1].tv_nsec = UTIME_OMIT; ++ ++ if (valid & FUSE_SET_ATTR_ATIME_NOW) ++ tv[0].tv_nsec = UTIME_NOW; ++ else if (valid & FUSE_SET_ATTR_ATIME) ++ tv[0] = attr->st_atim; ++ ++ if (valid & FUSE_SET_ATTR_MTIME_NOW) ++ tv[1].tv_nsec = UTIME_NOW; ++ else if (valid & FUSE_SET_ATTR_MTIME) ++ tv[1] = attr->st_mtim; ++ ++ if (fi) ++ res = futimens(fi->fh, tv); ++ else ++ res = utimensat_empty_nofollow(inode, tv); ++ if (res == -1) ++ goto out_err; ++ } ++ ++ return lo_getattr(req, ino, fi); ++ ++out_err: ++ saverr = errno; ++ fuse_reply_err(req, saverr); ++} ++ ++static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) ++{ ++ struct lo_inode *p; ++ struct lo_inode *ret = NULL; ++ ++ pthread_mutex_lock(&lo->mutex); ++ for (p = lo->root.next; p != &lo->root; p = p->next) { ++ if (p->ino == st->st_ino && p->dev == st->st_dev) { ++ assert(p->refcount > 0); ++ ret = p; ++ ret->refcount++; ++ break; ++ } ++ } ++ pthread_mutex_unlock(&lo->mutex); ++ return ret; ++} ++ ++static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, ++ struct fuse_entry_param *e) ++{ ++ int newfd; ++ int res; ++ int saverr; ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode; ++ ++ memset(e, 0, sizeof(*e)); ++ e->attr_timeout = lo->timeout; ++ e->entry_timeout = lo->timeout; ++ ++ newfd = openat(lo_fd(req, parent), name, O_PATH | 
O_NOFOLLOW); ++ if (newfd == -1) ++ goto out_err; ++ ++ res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) ++ goto out_err; ++ ++ inode = lo_find(lo_data(req), &e->attr); ++ if (inode) { ++ close(newfd); ++ newfd = -1; ++ } else { ++ struct lo_inode *prev, *next; ++ ++ saverr = ENOMEM; ++ inode = calloc(1, sizeof(struct lo_inode)); ++ if (!inode) ++ goto out_err; ++ ++ inode->is_symlink = S_ISLNK(e->attr.st_mode); ++ inode->refcount = 1; ++ inode->fd = newfd; ++ inode->ino = e->attr.st_ino; ++ inode->dev = e->attr.st_dev; ++ ++ pthread_mutex_lock(&lo->mutex); ++ prev = &lo->root; ++ next = prev->next; ++ next->prev = inode; ++ inode->next = next; ++ inode->prev = prev; ++ prev->next = inode; ++ pthread_mutex_unlock(&lo->mutex); ++ } ++ e->ino = (uintptr_t) inode; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long) parent, name, (unsigned long long) e->ino); ++ ++ return 0; ++ ++out_err: ++ saverr = errno; ++ if (newfd != -1) ++ close(newfd); ++ return saverr; ++} ++ ++static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) ++{ ++ struct fuse_entry_param e; ++ int err; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", ++ parent, name); ++ ++ err = lo_do_lookup(req, parent, name, &e); ++ if (err) ++ fuse_reply_err(req, err); ++ else ++ fuse_reply_entry(req, &e); ++} ++ ++static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, ++ const char *name, mode_t mode, dev_t rdev, ++ const char *link) ++{ ++ int res; ++ int saverr; ++ struct lo_inode *dir = lo_inode(req, parent); ++ struct fuse_entry_param e; ++ ++ saverr = ENOMEM; ++ ++ res = mknod_wrapper(dir->fd, name, link, mode, rdev); ++ ++ saverr = errno; ++ if (res == -1) ++ goto out; ++ ++ saverr = lo_do_lookup(req, parent, name, &e); ++ if (saverr) ++ goto out; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long) 
parent, name, (unsigned long long) e.ino); ++ ++ fuse_reply_entry(req, &e); ++ return; ++ ++out: ++ fuse_reply_err(req, saverr); ++} ++ ++static void lo_mknod(fuse_req_t req, fuse_ino_t parent, ++ const char *name, mode_t mode, dev_t rdev) ++{ ++ lo_mknod_symlink(req, parent, name, mode, rdev, NULL); ++} ++ ++static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode) ++{ ++ lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); ++} ++ ++static void lo_symlink(fuse_req_t req, const char *link, ++ fuse_ino_t parent, const char *name) ++{ ++ lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); ++} ++ ++static int linkat_empty_nofollow(struct lo_inode *inode, int dfd, ++ const char *name) ++{ ++ int res; ++ char procname[64]; ++ ++ if (inode->is_symlink) { ++ res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); ++ if (res == -1 && (errno == ENOENT || errno == EINVAL)) { ++ /* Sorry, no race free way to hard-link a symlink. */ ++ errno = EPERM; ++ } ++ return res; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ return linkat(AT_FDCWD, procname, dfd, name, AT_SYMLINK_FOLLOW); ++} ++ ++static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, ++ const char *name) ++{ ++ int res; ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); ++ struct fuse_entry_param e; ++ int saverr; ++ ++ memset(&e, 0, sizeof(struct fuse_entry_param)); ++ e.attr_timeout = lo->timeout; ++ e.entry_timeout = lo->timeout; ++ ++ res = linkat_empty_nofollow(inode, lo_fd(req, parent), name); ++ if (res == -1) ++ goto out_err; ++ ++ res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) ++ goto out_err; ++ ++ pthread_mutex_lock(&lo->mutex); ++ inode->refcount++; ++ pthread_mutex_unlock(&lo->mutex); ++ e.ino = (uintptr_t) inode; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long) parent, name, ++ (unsigned long 
long) e.ino); ++ ++ fuse_reply_entry(req, &e); ++ return; ++ ++out_err: ++ saverr = errno; ++ fuse_reply_err(req, saverr); ++} ++ ++static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) ++{ ++ int res; ++ ++ res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); ++ ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, ++ fuse_ino_t newparent, const char *newname, ++ unsigned int flags) ++{ ++ int res; ++ ++ if (flags) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ res = renameat(lo_fd(req, parent), name, ++ lo_fd(req, newparent), newname); ++ ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) ++{ ++ int res; ++ ++ res = unlinkat(lo_fd(req, parent), name, 0); ++ ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) ++{ ++ if (!inode) ++ return; ++ ++ pthread_mutex_lock(&lo->mutex); ++ assert(inode->refcount >= n); ++ inode->refcount -= n; ++ if (!inode->refcount) { ++ struct lo_inode *prev, *next; ++ ++ prev = inode->prev; ++ next = inode->next; ++ next->prev = prev; ++ prev->next = next; ++ ++ pthread_mutex_unlock(&lo->mutex); ++ close(inode->fd); ++ free(inode); ++ ++ } else { ++ pthread_mutex_unlock(&lo->mutex); ++ } ++} ++ ++static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) ++{ ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", ++ (unsigned long long) ino, ++ (unsigned long long) inode->refcount, ++ (unsigned long long) nlookup); ++ } ++ ++ unref_inode(lo, inode, nlookup); ++} ++ ++static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) ++{ ++ lo_forget_one(req, ino, nlookup); ++ fuse_reply_none(req); ++} ++ ++static void 
lo_forget_multi(fuse_req_t req, size_t count, ++ struct fuse_forget_data *forgets) ++{ ++ int i; ++ ++ for (i = 0; i < count; i++) ++ lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); ++ fuse_reply_none(req); ++} ++ ++static void lo_readlink(fuse_req_t req, fuse_ino_t ino) ++{ ++ char buf[PATH_MAX + 1]; ++ int res; ++ ++ res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); ++ if (res == -1) ++ return (void) fuse_reply_err(req, errno); ++ ++ if (res == sizeof(buf)) ++ return (void) fuse_reply_err(req, ENAMETOOLONG); ++ ++ buf[res] = '\0'; ++ ++ fuse_reply_readlink(req, buf); ++} ++ ++struct lo_dirp { ++ DIR *dp; ++ struct dirent *entry; ++ off_t offset; ++}; ++ ++static struct lo_dirp *lo_dirp(struct fuse_file_info *fi) ++{ ++ return (struct lo_dirp *) (uintptr_t) fi->fh; ++} ++ ++static void lo_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ int error = ENOMEM; ++ struct lo_data *lo = lo_data(req); ++ struct lo_dirp *d; ++ int fd; ++ ++ d = calloc(1, sizeof(struct lo_dirp)); ++ if (d == NULL) ++ goto out_err; ++ ++ fd = openat(lo_fd(req, ino), ".", O_RDONLY); ++ if (fd == -1) ++ goto out_errno; ++ ++ d->dp = fdopendir(fd); ++ if (d->dp == NULL) ++ goto out_errno; ++ ++ d->offset = 0; ++ d->entry = NULL; ++ ++ fi->fh = (uintptr_t) d; ++ if (lo->cache == CACHE_ALWAYS) ++ fi->keep_cache = 1; ++ fuse_reply_open(req, fi); ++ return; ++ ++out_errno: ++ error = errno; ++out_err: ++ if (d) { ++ if (fd != -1) ++ close(fd); ++ free(d); ++ } ++ fuse_reply_err(req, error); ++} ++ ++static int is_dot_or_dotdot(const char *name) ++{ ++ return name[0] == '.' && (name[1] == '\0' || ++ (name[1] == '.' 
&& name[2] == '\0')); ++} ++ ++static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, ++ off_t offset, struct fuse_file_info *fi, int plus) ++{ ++ struct lo_dirp *d = lo_dirp(fi); ++ char *buf; ++ char *p; ++ size_t rem = size; ++ int err; ++ ++ (void) ino; ++ ++ buf = calloc(1, size); ++ if (!buf) { ++ err = ENOMEM; ++ goto error; ++ } ++ p = buf; ++ ++ if (offset != d->offset) { ++ seekdir(d->dp, offset); ++ d->entry = NULL; ++ d->offset = offset; ++ } ++ while (1) { ++ size_t entsize; ++ off_t nextoff; ++ const char *name; ++ ++ if (!d->entry) { ++ errno = 0; ++ d->entry = readdir(d->dp); ++ if (!d->entry) { ++ if (errno) { // Error ++ err = errno; ++ goto error; ++ } else { // End of stream ++ break; ++ } ++ } ++ } ++ nextoff = d->entry->d_off; ++ name = d->entry->d_name; ++ fuse_ino_t entry_ino = 0; ++ if (plus) { ++ struct fuse_entry_param e; ++ if (is_dot_or_dotdot(name)) { ++ e = (struct fuse_entry_param) { ++ .attr.st_ino = d->entry->d_ino, ++ .attr.st_mode = d->entry->d_type << 12, ++ }; ++ } else { ++ err = lo_do_lookup(req, ino, name, &e); ++ if (err) ++ goto error; ++ entry_ino = e.ino; ++ } ++ ++ entsize = fuse_add_direntry_plus(req, p, rem, name, ++ &e, nextoff); ++ } else { ++ struct stat st = { ++ .st_ino = d->entry->d_ino, ++ .st_mode = d->entry->d_type << 12, ++ }; ++ entsize = fuse_add_direntry(req, p, rem, name, ++ &st, nextoff); ++ } ++ if (entsize > rem) { ++ if (entry_ino != 0) ++ lo_forget_one(req, entry_ino, 1); ++ break; ++ } ++ ++ p += entsize; ++ rem -= entsize; ++ ++ d->entry = NULL; ++ d->offset = nextoff; ++ } ++ ++ err = 0; ++error: ++ // If there's an error, we can only signal it if we haven't stored ++ // any entries yet - otherwise we'd end up with wrong lookup ++ // counts for the entries that are already in the buffer. So we ++ // return what we've collected until that point. 
++ if (err && rem == size) ++ fuse_reply_err(req, err); ++ else ++ fuse_reply_buf(req, buf, size - rem); ++ free(buf); ++} ++ ++static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, ++ off_t offset, struct fuse_file_info *fi) ++{ ++ lo_do_readdir(req, ino, size, offset, fi, 0); ++} ++ ++static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, ++ off_t offset, struct fuse_file_info *fi) ++{ ++ lo_do_readdir(req, ino, size, offset, fi, 1); ++} ++ ++static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ struct lo_dirp *d = lo_dirp(fi); ++ (void) ino; ++ closedir(d->dp); ++ free(d); ++ fuse_reply_err(req, 0); ++} ++ ++static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, struct fuse_file_info *fi) ++{ ++ int fd; ++ struct lo_data *lo = lo_data(req); ++ struct fuse_entry_param e; ++ int err; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", ++ parent, name); ++ ++ fd = openat(lo_fd(req, parent), name, ++ (fi->flags | O_CREAT) & ~O_NOFOLLOW, mode); ++ if (fd == -1) ++ return (void) fuse_reply_err(req, errno); ++ ++ fi->fh = fd; ++ if (lo->cache == CACHE_NEVER) ++ fi->direct_io = 1; ++ else if (lo->cache == CACHE_ALWAYS) ++ fi->keep_cache = 1; ++ ++ err = lo_do_lookup(req, parent, name, &e); ++ if (err) ++ fuse_reply_err(req, err); ++ else ++ fuse_reply_create(req, &e, fi); ++} ++ ++static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi) ++{ ++ int res; ++ int fd = dirfd(lo_dirp(fi)->dp); ++ (void) ino; ++ if (datasync) ++ res = fdatasync(fd); ++ else ++ res = fsync(fd); ++ fuse_reply_err(req, res == -1 ? 
errno : 0); ++} ++ ++static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ int fd; ++ char buf[64]; ++ struct lo_data *lo = lo_data(req); ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ++ ino, fi->flags); ++ ++ /* With writeback cache, kernel may send read requests even ++ when userspace opened write-only */ ++ if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { ++ fi->flags &= ~O_ACCMODE; ++ fi->flags |= O_RDWR; ++ } ++ ++ /* With writeback cache, O_APPEND is handled by the kernel. ++ This breaks atomicity (since the file may change in the ++ underlying filesystem, so that the kernel's idea of the ++ end of the file isn't accurate anymore). In this example, ++ we just accept that. A more rigorous filesystem may want ++ to return an error here */ ++ if (lo->writeback && (fi->flags & O_APPEND)) ++ fi->flags &= ~O_APPEND; ++ ++ sprintf(buf, "/proc/self/fd/%i", lo_fd(req, ino)); ++ fd = open(buf, fi->flags & ~O_NOFOLLOW); ++ if (fd == -1) ++ return (void) fuse_reply_err(req, errno); ++ ++ fi->fh = fd; ++ if (lo->cache == CACHE_NEVER) ++ fi->direct_io = 1; ++ else if (lo->cache == CACHE_ALWAYS) ++ fi->keep_cache = 1; ++ fuse_reply_open(req, fi); ++} ++ ++static void lo_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ (void) ino; ++ ++ close(fi->fh); ++ fuse_reply_err(req, 0); ++} ++ ++static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ int res; ++ (void) ino; ++ res = close(dup(fi->fh)); ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi) ++{ ++ int res; ++ (void) ino; ++ if (datasync) ++ res = fdatasync(fi->fh); ++ else ++ res = fsync(fi->fh); ++ fuse_reply_err(req, res == -1 ? 
errno : 0); ++} ++ ++static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, ++ off_t offset, struct fuse_file_info *fi) ++{ ++ struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_read(ino=%" PRIu64 ", size=%zd, " ++ "off=%lu)\n", ino, size, (unsigned long) offset); ++ ++ buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; ++ buf.buf[0].fd = fi->fh; ++ buf.buf[0].pos = offset; ++ ++ fuse_reply_data(req, &buf, FUSE_BUF_SPLICE_MOVE); ++} ++ ++static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_bufvec *in_buf, off_t off, ++ struct fuse_file_info *fi) ++{ ++ (void) ino; ++ ssize_t res; ++ struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); ++ ++ out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; ++ out_buf.buf[0].fd = fi->fh; ++ out_buf.buf[0].pos = off; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_write(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ++ ino, out_buf.buf[0].size, (unsigned long) off); ++ ++ res = fuse_buf_copy(&out_buf, in_buf, 0); ++ if(res < 0) ++ fuse_reply_err(req, -res); ++ else ++ fuse_reply_write(req, (size_t) res); ++} ++ ++static void lo_statfs(fuse_req_t req, fuse_ino_t ino) ++{ ++ int res; ++ struct statvfs stbuf; ++ ++ res = fstatvfs(lo_fd(req, ino), &stbuf); ++ if (res == -1) ++ fuse_reply_err(req, errno); ++ else ++ fuse_reply_statfs(req, &stbuf); ++} ++ ++static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, ++ off_t offset, off_t length, struct fuse_file_info *fi) ++{ ++ int err = EOPNOTSUPP; ++ (void) ino; ++ ++#ifdef HAVE_FALLOCATE ++ err = fallocate(fi->fh, mode, offset, length); ++ if (err < 0) ++ err = errno; ++ ++#elif defined(HAVE_POSIX_FALLOCATE) ++ if (mode) { ++ fuse_reply_err(req, EOPNOTSUPP); ++ return; ++ } ++ ++ err = posix_fallocate(fi->fh, offset, length); ++#endif ++ ++ fuse_reply_err(req, err); ++} ++ ++static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ int op) 
++{ ++ int res; ++ (void) ino; ++ ++ res = flock(fi->fh, op); ++ ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, ++ size_t size) ++{ ++ char *value = NULL; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) ++ goto out; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", ++ ino, name, size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to getxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ if (size) { ++ value = malloc(size); ++ if (!value) ++ goto out_err; ++ ++ ret = getxattr(procname, name, value, size); ++ if (ret == -1) ++ goto out_err; ++ saverr = 0; ++ if (ret == 0) ++ goto out; ++ ++ fuse_reply_buf(req, value, ret); ++ } else { ++ ret = getxattr(procname, name, NULL, 0); ++ if (ret == -1) ++ goto out_err; ++ ++ fuse_reply_xattr(req, ret); ++ } ++out_free: ++ free(value); ++ return; ++ ++out_err: ++ saverr = errno; ++out: ++ fuse_reply_err(req, saverr); ++ goto out_free; ++} ++ ++static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) ++{ ++ char *value = NULL; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) ++ goto out; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ++ ino, size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to listxattr on symlink. 
*/ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ if (size) { ++ value = malloc(size); ++ if (!value) ++ goto out_err; ++ ++ ret = listxattr(procname, value, size); ++ if (ret == -1) ++ goto out_err; ++ saverr = 0; ++ if (ret == 0) ++ goto out; ++ ++ fuse_reply_buf(req, value, ret); ++ } else { ++ ret = listxattr(procname, NULL, 0); ++ if (ret == -1) ++ goto out_err; ++ ++ fuse_reply_xattr(req, ret); ++ } ++out_free: ++ free(value); ++ return; ++ ++out_err: ++ saverr = errno; ++out: ++ fuse_reply_err(req, saverr); ++ goto out_free; ++} ++ ++static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, ++ const char *value, size_t size, int flags) ++{ ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) ++ goto out; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 ", name=%s value=%s size=%zd)\n", ++ ino, name, value, size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to setxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ ret = setxattr(procname, name, value, size, flags); ++ saverr = ret == -1 ? errno : 0; ++ ++out: ++ fuse_reply_err(req, saverr); ++} ++ ++static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) ++{ ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) ++ goto out; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ++ ino, name); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to setxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ ret = removexattr(procname, name); ++ saverr = ret == -1 ? 
errno : 0; ++ ++out: ++ fuse_reply_err(req, saverr); ++} ++ ++#ifdef HAVE_COPY_FILE_RANGE ++static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, ++ struct fuse_file_info *fi_in, ++ fuse_ino_t ino_out, off_t off_out, ++ struct fuse_file_info *fi_out, size_t len, ++ int flags) ++{ ++ ssize_t res; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_copy_file_range(ino=%" PRIu64 "/fd=%lu, " ++ "off=%lu, ino=%" PRIu64 "/fd=%lu, " ++ "off=%lu, size=%zd, flags=0x%x)\n", ++ ino_in, fi_in->fh, off_in, ino_out, fi_out->fh, off_out, ++ len, flags); ++ ++ res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out, len, ++ flags); ++ if (res < 0) ++ fuse_reply_err(req, -errno); ++ else ++ fuse_reply_write(req, res); ++} ++#endif ++ ++static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, ++ struct fuse_file_info *fi) ++{ ++ off_t res; ++ ++ (void)ino; ++ res = lseek(fi->fh, off, whence); ++ if (res != -1) ++ fuse_reply_lseek(req, res); ++ else ++ fuse_reply_err(req, errno); ++} ++ ++static struct fuse_lowlevel_ops lo_oper = { ++ .init = lo_init, ++ .lookup = lo_lookup, ++ .mkdir = lo_mkdir, ++ .mknod = lo_mknod, ++ .symlink = lo_symlink, ++ .link = lo_link, ++ .unlink = lo_unlink, ++ .rmdir = lo_rmdir, ++ .rename = lo_rename, ++ .forget = lo_forget, ++ .forget_multi = lo_forget_multi, ++ .getattr = lo_getattr, ++ .setattr = lo_setattr, ++ .readlink = lo_readlink, ++ .opendir = lo_opendir, ++ .readdir = lo_readdir, ++ .readdirplus = lo_readdirplus, ++ .releasedir = lo_releasedir, ++ .fsyncdir = lo_fsyncdir, ++ .create = lo_create, ++ .open = lo_open, ++ .release = lo_release, ++ .flush = lo_flush, ++ .fsync = lo_fsync, ++ .read = lo_read, ++ .write_buf = lo_write_buf, ++ .statfs = lo_statfs, ++ .fallocate = lo_fallocate, ++ .flock = lo_flock, ++ .getxattr = lo_getxattr, ++ .listxattr = lo_listxattr, ++ .setxattr = lo_setxattr, ++ .removexattr = lo_removexattr, ++#ifdef HAVE_COPY_FILE_RANGE ++ .copy_file_range = 
lo_copy_file_range, ++#endif ++ .lseek = lo_lseek, ++}; ++ ++int main(int argc, char *argv[]) ++{ ++ struct fuse_args args = FUSE_ARGS_INIT(argc, argv); ++ struct fuse_session *se; ++ struct fuse_cmdline_opts opts; ++ struct lo_data lo = { .debug = 0, ++ .writeback = 0 }; ++ int ret = -1; ++ ++ /* Don't mask creation mode, kernel already did that */ ++ umask(0); ++ ++ pthread_mutex_init(&lo.mutex, NULL); ++ lo.root.next = lo.root.prev = &lo.root; ++ lo.root.fd = -1; ++ lo.cache = CACHE_NORMAL; ++ ++ if (fuse_parse_cmdline(&args, &opts) != 0) ++ return 1; ++ if (opts.show_help) { ++ printf("usage: %s [options] \n\n", argv[0]); ++ fuse_cmdline_help(); ++ fuse_lowlevel_help(); ++ ret = 0; ++ goto err_out1; ++ } else if (opts.show_version) { ++ printf("FUSE library version %s\n", fuse_pkgversion()); ++ fuse_lowlevel_version(); ++ ret = 0; ++ goto err_out1; ++ } ++ ++ if(opts.mountpoint == NULL) { ++ printf("usage: %s [options] \n", argv[0]); ++ printf(" %s --help\n", argv[0]); ++ ret = 1; ++ goto err_out1; ++ } ++ ++ if (fuse_opt_parse(&args, &lo, lo_opts, NULL)== -1) ++ return 1; ++ ++ lo.debug = opts.debug; ++ lo.root.refcount = 2; ++ if (lo.source) { ++ struct stat stat; ++ int res; ++ ++ res = lstat(lo.source, &stat); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", ++ lo.source); ++ exit(1); ++ } ++ if (!S_ISDIR(stat.st_mode)) { ++ fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); ++ exit(1); ++ } ++ ++ } else { ++ lo.source = "/"; ++ } ++ lo.root.is_symlink = false; ++ if (!lo.timeout_set) { ++ switch (lo.cache) { ++ case CACHE_NEVER: ++ lo.timeout = 0.0; ++ break; ++ ++ case CACHE_NORMAL: ++ lo.timeout = 1.0; ++ break; ++ ++ case CACHE_ALWAYS: ++ lo.timeout = 86400.0; ++ break; ++ } ++ } else if (lo.timeout < 0) { ++ fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", ++ lo.timeout); ++ exit(1); ++ } ++ ++ lo.root.fd = open(lo.source, O_PATH); ++ if (lo.root.fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): 
%m\n", ++ lo.source); ++ exit(1); ++ } ++ ++ se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); ++ if (se == NULL) ++ goto err_out1; ++ ++ if (fuse_set_signal_handlers(se) != 0) ++ goto err_out2; ++ ++ if (fuse_session_mount(se, opts.mountpoint) != 0) ++ goto err_out3; ++ ++ fuse_daemonize(opts.foreground); ++ ++ /* Block until ctrl+c or fusermount -u */ ++ if (opts.singlethread) ++ ret = fuse_session_loop(se); ++ else ++ ret = fuse_session_loop_mt(se, opts.clone_fd); ++ ++ fuse_session_unmount(se); ++err_out3: ++ fuse_remove_signal_handlers(se); ++err_out2: ++ fuse_session_destroy(se); ++err_out1: ++ free(opts.mountpoint); ++ fuse_opt_free_args(&args); ++ ++ if (lo.root.fd >= 0) ++ close(lo.root.fd); ++ ++ return ret ? 1 : 0; ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-timestamp-to-the-log-with-FUSE_LOG_DEB.patch b/kvm-virtiofsd-Add-timestamp-to-the-log-with-FUSE_LOG_DEB.patch new file mode 100755 index 0000000..cef537a --- /dev/null +++ b/kvm-virtiofsd-Add-timestamp-to-the-log-with-FUSE_LOG_DEB.patch @@ -0,0 +1,73 @@ +From 52e93f2dc499ead339bf808dac3480b369dfadd1 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:39 +0100 +Subject: [PATCH 068/116] virtiofsd: Add timestamp to the log with + FUSE_LOG_DEBUG level +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-65-dgilbert@redhat.com> +Patchwork-id: 93517 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 064/112] virtiofsd: Add timestamp to the log with FUSE_LOG_DEBUG level +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Masayoshi Mizuma + +virtiofsd has some threads, so we see a lot of logs with debug option. +It would be useful for debugging if we can see the timestamp. 
+ +Add nano second timestamp, which got by get_clock(), to the log with +FUSE_LOG_DEBUG level if the syslog option isn't set. + +The log is like as: + + # ./virtiofsd -d -o vhost_user_socket=/tmp/vhostqemu0 -o source=/tmp/share0 -o cache=auto + ... + [5365943125463727] [ID: 00000002] fv_queue_thread: Start for queue 0 kick_fd 9 + [5365943125568644] [ID: 00000002] fv_queue_thread: Waiting for Queue 0 event + [5365943125573561] [ID: 00000002] fv_queue_thread: Got queue event on Queue 0 + +Signed-off-by: Masayoshi Mizuma +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 50fb955aa0e6ede929422146936cf68bf1ca876f) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index f08324f..98114a3 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -36,6 +36,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/timer.h" + #include "fuse_virtio.h" + #include "fuse_log.h" + #include "fuse_lowlevel.h" +@@ -2276,7 +2277,13 @@ static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) + } + + if (current_log_level == FUSE_LOG_DEBUG) { +- localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), fmt); ++ if (!use_syslog) { ++ localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s", ++ get_clock(), syscall(__NR_gettid), fmt); ++ } else { ++ localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), ++ fmt); ++ } + fmt = localfmt; + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Clean-up-inodes-on-destroy.patch b/kvm-virtiofsd-Clean-up-inodes-on-destroy.patch new file mode 100755 index 0000000..4713a0d --- /dev/null +++ b/kvm-virtiofsd-Clean-up-inodes-on-destroy.patch @@ -0,0 +1,85 @@ +From 2b921f7162b53204051955228bf99bbed55d2457 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:53 +0100 +Subject: [PATCH 082/116] virtiofsd: Clean up inodes on destroy +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-79-dgilbert@redhat.com> +Patchwork-id: 93532 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 078/112] virtiofsd: Clean up inodes on destroy +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Clear out our inodes and fd's on a 'destroy' - so we get rid +of them if we reboot the guest. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 771b01eb76ff480fee984bd1d21727147cc3e702) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 26 ++++++++++++++++++++++++++ + 1 file changed, 26 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index b176a31..9ed77a1 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1169,6 +1169,25 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + } + } + ++static int unref_all_inodes_cb(gpointer key, gpointer value, gpointer user_data) ++{ ++ struct lo_inode *inode = value; ++ struct lo_data *lo = user_data; ++ ++ inode->refcount = 0; ++ lo_map_remove(&lo->ino_map, inode->fuse_ino); ++ close(inode->fd); ++ ++ return TRUE; ++} ++ ++static void unref_all_inodes(struct lo_data *lo) ++{ ++ pthread_mutex_lock(&lo->mutex); ++ g_hash_table_foreach_remove(lo->inodes, unref_all_inodes_cb, lo); ++ pthread_mutex_unlock(&lo->mutex); ++} ++ + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { + struct lo_data *lo = lo_data(req); +@@ -2035,6 +2054,12 @@ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int 
whence, + } + } + ++static void lo_destroy(void *userdata) ++{ ++ struct lo_data *lo = (struct lo_data *)userdata; ++ unref_all_inodes(lo); ++} ++ + static struct fuse_lowlevel_ops lo_oper = { + .init = lo_init, + .lookup = lo_lookup, +@@ -2073,6 +2098,7 @@ static struct fuse_lowlevel_ops lo_oper = { + .copy_file_range = lo_copy_file_range, + #endif + .lseek = lo_lseek, ++ .destroy = lo_destroy, + }; + + /* Print vhost-user.json backend program capabilities */ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Convert-lo_destroy-to-take-the-lo-mutex-lo.patch b/kvm-virtiofsd-Convert-lo_destroy-to-take-the-lo-mutex-lo.patch new file mode 100755 index 0000000..c421365 --- /dev/null +++ b/kvm-virtiofsd-Convert-lo_destroy-to-take-the-lo-mutex-lo.patch @@ -0,0 +1,112 @@ +From 24f91062f571ad2dd2ac22db3b7d456a2c8bd2cb Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:23 +0100 +Subject: [PATCH 112/116] virtiofsd: Convert lo_destroy to take the lo->mutex + lock itself +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-109-dgilbert@redhat.com> +Patchwork-id: 93563 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 108/112] virtiofsd: Convert lo_destroy to take the lo->mutex lock itself +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +lo_destroy was relying on some implicit knowledge of the locking; +we can avoid this if we create an unref_inode that doesn't take +the lock and then grab it for the whole of the lo_destroy. + +Suggested-by: Vivek Goyal +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit fe4c15798a48143dd6b1f58d2d3cad12206ce211) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 31 +++++++++++++++++-------------- + 1 file changed, 17 insertions(+), 14 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index eb001b9..fc15d61 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1344,14 +1344,13 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + lo_inode_put(lo, &inode); + } + +-static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, +- uint64_t n) ++/* To be called with lo->mutex held */ ++static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) + { + if (!inode) { + return; + } + +- pthread_mutex_lock(&lo->mutex); + assert(inode->nlookup >= n); + inode->nlookup -= n; + if (!inode->nlookup) { +@@ -1362,15 +1361,24 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + } + g_hash_table_destroy(inode->posix_locks); + pthread_mutex_destroy(&inode->plock_mutex); +- pthread_mutex_unlock(&lo->mutex); + + /* Drop our refcount from lo_do_lookup() */ + lo_inode_put(lo, &inode); +- } else { +- pthread_mutex_unlock(&lo->mutex); + } + } + ++static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, ++ uint64_t n) ++{ ++ if (!inode) { ++ return; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ unref_inode(lo, inode, n); ++ pthread_mutex_unlock(&lo->mutex); ++} ++ + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { + struct lo_data *lo = lo_data(req); +@@ -2458,13 +2466,7 @@ static void lo_destroy(void *userdata) + { + struct lo_data *lo = (struct lo_data *)userdata; + +- /* +- * Normally lo->mutex must be taken when traversing lo->inodes but +- * lo_destroy() is a serialized request so no races are possible here. 
+- * +- * In addition, we cannot acquire lo->mutex since unref_inode() takes it +- * too and this would result in a recursive lock. +- */ ++ pthread_mutex_lock(&lo->mutex); + while (true) { + GHashTableIter iter; + gpointer key, value; +@@ -2475,8 +2477,9 @@ static void lo_destroy(void *userdata) + } + + struct lo_inode *inode = value; +- unref_inode_lolocked(lo, inode, inode->nlookup); ++ unref_inode(lo, inode, inode->nlookup); + } ++ pthread_mutex_unlock(&lo->mutex); + } + + static struct fuse_lowlevel_ops lo_oper = { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch b/kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch new file mode 100755 index 0000000..90b6b35 --- /dev/null +++ b/kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch @@ -0,0 +1,72 @@ +From 3ec945ba7c2649cca13cf6070c6365b1262ad1ec Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Fri, 6 Aug 2021 11:58:26 -0400 +Subject: [PATCH 1/2] virtiofsd: Disable remote posix locks by default +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Max Reitz +Message-id: <20210806115827.740945-2-mreitz@redhat.com> +Patchwork-id: 101970 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] virtiofsd: Disable remote posix locks by default +Bugzilla: 1967496 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Vivek Goyal + +From: Vivek Goyal + +Right now we enable remote posix locks by default. That means when guest +does a posix lock it sends request to server (virtiofsd). But currently +we only support non-blocking posix lock and return -EOPNOTSUPP for +blocking version. + +This means that existing applications which are doing blocking posix +locks get -EOPNOTSUPP and fail. To avoid this, people have been +running virtiofsd with option "-o no_posix_lock". For new users it +is still a surprise and trial and error takes them to this option.
+ +Given posix lock implementation is not complete in virtiofsd, disable +it by default. This means that posix locks will work with-in applications +in a guest but not across guests. Anyway we don't support sharing +filesystem among different guests yet in virtiofs so this should +not lead to any kind of surprise or regression and will make life +little easier for virtiofs users. + +Reported-by: Aa Aa +Suggested-by: Miklos Szeredi +Signed-off-by: Vivek Goyal +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 88fc107956a5812649e5918e0c092d3f78bb28ad) + +Conflicts: + docs/tools/virtiofsd.rst + We do not have virtiofsd.rst downstream (added upstream in + commit 6a7e2bbee5fa), so I dropped that hunk (which effectively + updated the default value in the man page). + +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index cb0992f2db..b47029da89 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -3001,7 +3001,7 @@ int main(int argc, char *argv[]) + struct lo_data lo = { + .debug = 0, + .writeback = 0, +- .posix_lock = 1, ++ .posix_lock = 0, + .proc_self_fd = -1, + }; + struct lo_map_elem *root_elem; +-- +2.27.0 + diff --git a/kvm-virtiofsd-Drop-CAP_FSETID-if-client-asked-for-it.patch b/kvm-virtiofsd-Drop-CAP_FSETID-if-client-asked-for-it.patch new file mode 100755 index 0000000..9f198c2 --- /dev/null +++ b/kvm-virtiofsd-Drop-CAP_FSETID-if-client-asked-for-it.patch @@ -0,0 +1,176 @@ +From e217ab392e0d4c770ec18dbfbe986771773cb557 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:33 +0100 +Subject: [PATCH 062/116] virtiofsd: Drop CAP_FSETID if client asked for it +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-59-dgilbert@redhat.com> +Patchwork-id: 93513 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 058/112] virtiofsd: Drop CAP_FSETID if client asked for it +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +If client requested killing setuid/setgid bits on file being written, drop +CAP_FSETID capability so that setuid/setgid bits are cleared upon write +automatically. + +pjdfstest chown/12.t needs this. + +Signed-off-by: Vivek Goyal + dgilbert: reworked for libcap-ng +Reviewed-by: Misono Tomohiro +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit ee88465224b3aed2596049caa28f86cbe0d5a3d0) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 105 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 105 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 97e7c75..d53cb1e 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -201,6 +201,91 @@ static int load_capng(void) + return 0; + } + ++/* ++ * Helpers for dropping and regaining effective capabilities. 
Returns 0 ++ * on success, error otherwise ++ */ ++static int drop_effective_cap(const char *cap_name, bool *cap_dropped) ++{ ++ int cap, ret; ++ ++ cap = capng_name_to_capability(cap_name); ++ if (cap < 0) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", ++ cap_name, strerror(errno)); ++ goto out; ++ } ++ ++ if (load_capng()) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); ++ goto out; ++ } ++ ++ /* We dont have this capability in effective set already. */ ++ if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) { ++ ret = 0; ++ goto out; ++ } ++ ++ if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n"); ++ goto out; ++ } ++ ++ if (capng_apply(CAPNG_SELECT_CAPS)) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n"); ++ goto out; ++ } ++ ++ ret = 0; ++ if (cap_dropped) { ++ *cap_dropped = true; ++ } ++ ++out: ++ return ret; ++} ++ ++static int gain_effective_cap(const char *cap_name) ++{ ++ int cap; ++ int ret = 0; ++ ++ cap = capng_name_to_capability(cap_name); ++ if (cap < 0) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", ++ cap_name, strerror(errno)); ++ goto out; ++ } ++ ++ if (load_capng()) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); ++ goto out; ++ } ++ ++ if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n"); ++ goto out; ++ } ++ ++ if (capng_apply(CAPNG_SELECT_CAPS)) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n"); ++ goto out; ++ } ++ ret = 0; ++ ++out: ++ return ret; ++} ++ + static void lo_map_init(struct lo_map *map) + { + map->elems = NULL; +@@ -1577,6 +1662,7 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + (void)ino; + ssize_t res; + struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); ++ bool 
cap_fsetid_dropped = false; + + out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; + out_buf.buf[0].fd = lo_fi_fd(req, fi); +@@ -1588,12 +1674,31 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + out_buf.buf[0].size, (unsigned long)off); + } + ++ /* ++ * If kill_priv is set, drop CAP_FSETID which should lead to kernel ++ * clearing setuid/setgid on file. ++ */ ++ if (fi->kill_priv) { ++ res = drop_effective_cap("FSETID", &cap_fsetid_dropped); ++ if (res != 0) { ++ fuse_reply_err(req, res); ++ return; ++ } ++ } ++ + res = fuse_buf_copy(&out_buf, in_buf); + if (res < 0) { + fuse_reply_err(req, -res); + } else { + fuse_reply_write(req, (size_t)res); + } ++ ++ if (cap_fsetid_dropped) { ++ res = gain_effective_cap("FSETID"); ++ if (res) { ++ fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); ++ } ++ } + } + + static void lo_statfs(fuse_req_t req, fuse_ino_t ino) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch b/kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch new file mode 100755 index 0000000..152032b --- /dev/null +++ b/kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch @@ -0,0 +1,111 @@ +From 746e07f2d54908296dde64e97e12ea33a35063e0 Mon Sep 17 00:00:00 2001 +From: Vivek Goyal +Date: Tue, 25 Jan 2022 13:51:14 -0500 +Subject: [PATCH] virtiofsd: Drop membership of all supplementary groups + (CVE-2022-0358) + +RH-Author: Dr. David Alan Gilbert +RH-MergeRequest: 106: 8.5.0z non-av; virtiofsd security fix - drop secondary groups +RH-Commit: [1/1] e39df0b31f3c236675262395b94d5c10e8e3073f +RH-Bugzilla: 2048627 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Vivek Goyal + +At the start, drop membership of all supplementary groups. This is +not required. + +If we have membership of "root" supplementary group and when we switch +uid/gid using setresuid/setsgid, we still retain membership of existing +supplementary groups.
And that can allow some operations which are not +normally allowed. + +For example, if root in guest creates a dir as follows. + +$ mkdir -m 03777 test_dir + +This sets SGID on dir as well as allows unprivileged users to write into +this dir. + +And now as unprivileged user open file as follows. + +$ su test +$ fd = open("test_dir/priviledge_id", O_RDWR|O_CREAT|O_EXCL, 02755); + +This will create SGID set executable in test_dir/. + +And that's a problem because now an unpriviliged user can execute it, +get egid=0 and get access to resources owned by "root" group. This is +privilege escalation. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2044863 +Fixes: CVE-2022-0358 +Reported-by: JIETAO XIAO +Suggested-by: Miklos Szeredi +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal +Message-Id: +Signed-off-by: Dr. David Alan Gilbert + dgilbert: Fixed missing {}'s style nit +(cherry picked from commit 449e8171f96a6a944d1f3b7d3627ae059eae21ca) + dgilbert: Minor fixup around #includes on backport +--- + tools/virtiofsd/passthrough_ll.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index b47029da89..578131179c 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -63,6 +63,7 @@ + #include + #include + #include ++#include + + #include "passthrough_helpers.h" + #include "seccomp.h" +@@ -1058,6 +1059,30 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + #define OURSYS_setresuid SYS_setresuid + #endif + ++static void drop_supplementary_groups(void) ++{ ++ int ret; ++ ++ ret = getgroups(0, NULL); ++ if (ret == -1) { ++ fuse_log(FUSE_LOG_ERR, "getgroups() failed with error=%d:%s\n", ++ errno, strerror(errno)); ++ exit(1); ++ } ++ ++ if (!ret) { ++ return; ++ } ++ ++ /* Drop all supplementary groups. 
We should not need it */ ++ ret = setgroups(0, NULL); ++ if (ret == -1) { ++ fuse_log(FUSE_LOG_ERR, "setgroups() failed with error=%d:%s\n", ++ errno, strerror(errno)); ++ exit(1); ++ } ++} ++ + /* + * Change to uid/gid of caller so that file is created with + * ownership of caller. +@@ -3010,6 +3035,8 @@ int main(int argc, char *argv[]) + /* Don't mask creation mode, kernel already did that */ + umask(0); + ++ drop_supplementary_groups(); ++ + pthread_mutex_init(&lo.mutex, NULL); + lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal); + lo.root.fd = -1; +-- +2.27.0 + diff --git a/kvm-virtiofsd-Fast-path-for-virtio-read.patch b/kvm-virtiofsd-Fast-path-for-virtio-read.patch new file mode 100755 index 0000000..03874ce --- /dev/null +++ b/kvm-virtiofsd-Fast-path-for-virtio-read.patch @@ -0,0 +1,240 @@ +From 7d2efc3e4af15eff57b0c38cff7c81b371a98303 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:06 +0100 +Subject: [PATCH 035/116] virtiofsd: Fast path for virtio read +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-32-dgilbert@redhat.com> +Patchwork-id: 93480 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 031/112] virtiofsd: Fast path for virtio read +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Readv the data straight into the guests buffer. + +Signed-off-by: Dr. David Alan Gilbert +With fix by: +Signed-off-by: Eryu Guan +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit eb49d187ef5134483a34c970bbfece28aaa686a7) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 5 ++ + tools/virtiofsd/fuse_virtio.c | 162 ++++++++++++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_virtio.h | 4 + + 3 files changed, 171 insertions(+) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 380d93b..4f4684d 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -475,6 +475,11 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se, + return fuse_send_msg(se, ch, iov, iov_count); + } + ++ if (fuse_lowlevel_is_virtio(se) && buf->count == 1 && ++ buf->buf[0].flags == (FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK)) { ++ return virtio_send_data_iov(se, ch, iov, iov_count, buf, len); ++ } ++ + abort(); /* Will have taken vhost path */ + return 0; + } +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index f1adeb6..7e2711b 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -230,6 +230,168 @@ err: + return ret; + } + ++/* ++ * Callback from fuse_send_data_iov_* when it's virtio and the buffer ++ * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK ++ * We need send the iov and then the buffer. ++ * Return 0 on success ++ */ ++int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count, struct fuse_bufvec *buf, ++ size_t len) ++{ ++ int ret = 0; ++ VuVirtqElement *elem; ++ VuVirtq *q; ++ ++ assert(count >= 1); ++ assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); ++ ++ struct fuse_out_header *out = iov[0].iov_base; ++ /* TODO: Endianness! 
*/ ++ ++ size_t iov_len = iov_size(iov, count); ++ size_t tosend_len = iov_len + len; ++ ++ out->len = tosend_len; ++ ++ fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__, ++ count, len, iov_len); ++ ++ /* unique == 0 is notification which we don't support */ ++ assert(out->unique); ++ ++ /* For virtio we always have ch */ ++ assert(ch); ++ assert(!ch->qi->reply_sent); ++ elem = ch->qi->qe; ++ q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; ++ ++ /* The 'in' part of the elem is to qemu */ ++ unsigned int in_num = elem->in_num; ++ struct iovec *in_sg = elem->in_sg; ++ size_t in_len = iov_size(in_sg, in_num); ++ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n", ++ __func__, elem->index, in_num, in_len); ++ ++ /* ++ * The elem should have room for a 'fuse_out_header' (out from fuse) ++ * plus the data based on the len in the header. ++ */ ++ if (in_len < sizeof(struct fuse_out_header)) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", ++ __func__, elem->index); ++ ret = E2BIG; ++ goto err; ++ } ++ if (in_len < tosend_len) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", ++ __func__, elem->index, tosend_len); ++ ret = E2BIG; ++ goto err; ++ } ++ ++ /* TODO: Limit to 'len' */ ++ ++ /* First copy the header data from iov->in_sg */ ++ copy_iov(iov, count, in_sg, in_num, iov_len); ++ ++ /* ++ * Build a copy of the the in_sg iov so we can skip bits in it, ++ * including changing the offsets ++ */ ++ struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num); ++ assert(in_sg_cpy); ++ memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num); ++ /* These get updated as we skip */ ++ struct iovec *in_sg_ptr = in_sg_cpy; ++ int in_sg_cpy_count = in_num; ++ ++ /* skip over parts of in_sg that contained the header iov */ ++ size_t skip_size = iov_len; ++ ++ size_t in_sg_left = 0; ++ do { ++ while (skip_size != 0 && in_sg_cpy_count) { ++ if (skip_size >= in_sg_ptr[0].iov_len) { ++ skip_size -= 
in_sg_ptr[0].iov_len; ++ in_sg_ptr++; ++ in_sg_cpy_count--; ++ } else { ++ in_sg_ptr[0].iov_len -= skip_size; ++ in_sg_ptr[0].iov_base += skip_size; ++ break; ++ } ++ } ++ ++ int i; ++ for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) { ++ in_sg_left += in_sg_ptr[i].iov_len; ++ } ++ fuse_log(FUSE_LOG_DEBUG, ++ "%s: after skip skip_size=%zd in_sg_cpy_count=%d " ++ "in_sg_left=%zd\n", ++ __func__, skip_size, in_sg_cpy_count, in_sg_left); ++ ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count, ++ buf->buf[0].pos); ++ ++ if (ret == -1) { ++ ret = errno; ++ fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n", ++ __func__, len); ++ free(in_sg_cpy); ++ goto err; ++ } ++ fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__, ++ ret, len); ++ if (ret < len && ret) { ++ fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__); ++ /* Skip over this much next time around */ ++ skip_size = ret; ++ buf->buf[0].pos += ret; ++ len -= ret; ++ ++ /* Lets do another read */ ++ continue; ++ } ++ if (!ret) { ++ /* EOF case? 
*/ ++ fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__, ++ in_sg_left); ++ break; ++ } ++ if (ret != len) { ++ fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__); ++ ret = EIO; ++ free(in_sg_cpy); ++ goto err; ++ } ++ in_sg_left -= ret; ++ len -= ret; ++ } while (in_sg_left); ++ free(in_sg_cpy); ++ ++ /* Need to fix out->len on EOF */ ++ if (len) { ++ struct fuse_out_header *out_sg = in_sg[0].iov_base; ++ ++ tosend_len -= len; ++ out_sg->len = tosend_len; ++ } ++ ++ ret = 0; ++ ++ vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); ++ vu_queue_notify(&se->virtio_dev->dev, q); ++ ++err: ++ if (ret == 0) { ++ ch->qi->reply_sent = true; ++ } ++ ++ return ret; ++} ++ + /* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +index 135a148..cc676b9 100644 +--- a/tools/virtiofsd/fuse_virtio.h ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -26,4 +26,8 @@ int virtio_loop(struct fuse_session *se); + int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count); + ++int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count, ++ struct fuse_bufvec *buf, size_t len); ++ + #endif +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Fix-common-header-and-define-for-QEMU-buil.patch b/kvm-virtiofsd-Fix-common-header-and-define-for-QEMU-buil.patch new file mode 100755 index 0000000..12bb9a2 --- /dev/null +++ b/kvm-virtiofsd-Fix-common-header-and-define-for-QEMU-buil.patch @@ -0,0 +1,164 @@ +From 6d41fc549198e140f38fddcb02975098df040ae1 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:50 +0100 +Subject: [PATCH 019/116] virtiofsd: Fix common header and define for QEMU + builds +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-16-dgilbert@redhat.com> +Patchwork-id: 93470 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 015/112] virtiofsd: Fix common header and define for QEMU builds +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +All of the fuse files include config.h and define GNU_SOURCE +where we don't have either under our build - remove them. +Fixup path to the kernel's fuse.h in the QEMUs world. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Tested-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 09863ebc7e32a107235b3c815ad54d26cc64f07a) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 4 +--- + tools/virtiofsd/fuse_i.h | 3 +++ + tools/virtiofsd/fuse_log.c | 1 + + tools/virtiofsd/fuse_lowlevel.c | 6 ++---- + tools/virtiofsd/fuse_opt.c | 2 +- + tools/virtiofsd/fuse_signals.c | 2 +- + tools/virtiofsd/helper.c | 1 + + tools/virtiofsd/passthrough_ll.c | 8 ++------ + 8 files changed, 12 insertions(+), 15 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 4d507f3..772efa9 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -9,9 +9,7 @@ + * See the file COPYING.LIB + */ + +-#define _GNU_SOURCE +- +-#include "config.h" ++#include "qemu/osdep.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" + #include +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index e63cb58..bae0699 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -6,6 +6,9 @@ + * See the file COPYING.LIB + */ + ++#define FUSE_USE_VERSION 31 ++ ++ + #include "fuse.h" + #include "fuse_lowlevel.h" + +diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c +index 11345f9..c301ff6 100644 +--- a/tools/virtiofsd/fuse_log.c ++++ b/tools/virtiofsd/fuse_log.c +@@ -8,6 +8,7 @@ + 
* See the file COPYING.LIB + */ + ++#include "qemu/osdep.h" + #include "fuse_log.h" + + #include +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 3da80de..07fb8a6 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -9,11 +9,9 @@ + * See the file COPYING.LIB + */ + +-#define _GNU_SOURCE +- +-#include "config.h" ++#include "qemu/osdep.h" + #include "fuse_i.h" +-#include "fuse_kernel.h" ++#include "standard-headers/linux/fuse.h" + #include "fuse_misc.h" + #include "fuse_opt.h" + +diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c +index edd36f4..2892236 100644 +--- a/tools/virtiofsd/fuse_opt.c ++++ b/tools/virtiofsd/fuse_opt.c +@@ -9,8 +9,8 @@ + * See the file COPYING.LIB + */ + ++#include "qemu/osdep.h" + #include "fuse_opt.h" +-#include "config.h" + #include "fuse_i.h" + #include "fuse_misc.h" + +diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c +index 19d6791..dc7c8ac 100644 +--- a/tools/virtiofsd/fuse_signals.c ++++ b/tools/virtiofsd/fuse_signals.c +@@ -8,7 +8,7 @@ + * See the file COPYING.LIB + */ + +-#include "config.h" ++#include "qemu/osdep.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index d9227d7..9333691 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -10,6 +10,7 @@ + * See the file COPYING.LIB. 
+ */ + ++#include "qemu/osdep.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" + #include "fuse_misc.h" +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 126a56c..322a889 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -35,15 +35,11 @@ + * \include passthrough_ll.c + */ + +-#define _GNU_SOURCE +-#define FUSE_USE_VERSION 31 +- +-#include "config.h" +- ++#include "qemu/osdep.h" ++#include "fuse_lowlevel.h" + #include + #include + #include +-#include + #include + #include + #include +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Fix-data-corruption-with-O_APPEND-write-in.patch b/kvm-virtiofsd-Fix-data-corruption-with-O_APPEND-write-in.patch new file mode 100755 index 0000000..f929bab --- /dev/null +++ b/kvm-virtiofsd-Fix-data-corruption-with-O_APPEND-write-in.patch @@ -0,0 +1,136 @@ +From 9b5fbc95a287b2ce9448142194b161d8360d5e4e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:15 +0100 +Subject: [PATCH 104/116] virtiofsd: Fix data corruption with O_APPEND write in + writeback mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-101-dgilbert@redhat.com> +Patchwork-id: 93556 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 100/112] virtiofsd: Fix data corruption with O_APPEND write in writeback mode +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Misono Tomohiro + +When writeback mode is enabled (-o writeback), O_APPEND handling is +done in kernel. Therefore virtiofsd clears O_APPEND flag when open. +Otherwise O_APPEND flag takes precedence over pwrite() and write +data may corrupt. + +Currently clearing O_APPEND flag is done in lo_open(), but we also +need the same operation in lo_create(). 
So, factor out the flag +update operation in lo_open() to update_open_flags() and call it +in both lo_open() and lo_create(). + +This fixes the failure of xfstest generic/069 in writeback mode +(which tests O_APPEND write data integrity). + +Signed-off-by: Misono Tomohiro +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 8e4e41e39eac5ee5f378d66f069a2f70a1734317) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 66 ++++++++++++++++++++-------------------- + 1 file changed, 33 insertions(+), 33 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 948cb19..4c61ac5 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1692,6 +1692,37 @@ static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, + fuse_reply_err(req, 0); + } + ++static void update_open_flags(int writeback, struct fuse_file_info *fi) ++{ ++ /* ++ * With writeback cache, kernel may send read requests even ++ * when userspace opened write-only ++ */ ++ if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { ++ fi->flags &= ~O_ACCMODE; ++ fi->flags |= O_RDWR; ++ } ++ ++ /* ++ * With writeback cache, O_APPEND is handled by the kernel. ++ * This breaks atomicity (since the file may change in the ++ * underlying filesystem, so that the kernel's idea of the ++ * end of the file isn't accurate anymore). In this example, ++ * we just accept that. A more rigorous filesystem may want ++ * to return an error here ++ */ ++ if (writeback && (fi->flags & O_APPEND)) { ++ fi->flags &= ~O_APPEND; ++ } ++ ++ /* ++ * O_DIRECT in guest should not necessarily mean bypassing page ++ * cache on host as well. If somebody needs that behavior, it ++ * probably should be a configuration knob in daemon. 
++ */ ++ fi->flags &= ~O_DIRECT; ++} ++ + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) + { +@@ -1721,12 +1752,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out; + } + +- /* +- * O_DIRECT in guest should not necessarily mean bypassing page +- * cache on host as well. If somebody needs that behavior, it +- * probably should be a configuration knob in daemon. +- */ +- fi->flags &= ~O_DIRECT; ++ update_open_flags(lo->writeback, fi); + + fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, + mode); +@@ -1936,33 +1962,7 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, + fi->flags); + +- /* +- * With writeback cache, kernel may send read requests even +- * when userspace opened write-only +- */ +- if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { +- fi->flags &= ~O_ACCMODE; +- fi->flags |= O_RDWR; +- } +- +- /* +- * With writeback cache, O_APPEND is handled by the kernel. +- * This breaks atomicity (since the file may change in the +- * underlying filesystem, so that the kernel's idea of the +- * end of the file isn't accurate anymore). In this example, +- * we just accept that. A more rigorous filesystem may want +- * to return an error here +- */ +- if (lo->writeback && (fi->flags & O_APPEND)) { +- fi->flags &= ~O_APPEND; +- } +- +- /* +- * O_DIRECT in guest should not necessarily mean bypassing page +- * cache on host as well. If somebody needs that behavior, it +- * probably should be a configuration knob in daemon. 
+- */ +- fi->flags &= ~O_DIRECT; ++ update_open_flags(lo->writeback, fi); + + sprintf(buf, "%i", lo_fd(req, ino)); + fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Fix-fuse_daemonize-ignored-return-values.patch b/kvm-virtiofsd-Fix-fuse_daemonize-ignored-return-values.patch new file mode 100755 index 0000000..306c183 --- /dev/null +++ b/kvm-virtiofsd-Fix-fuse_daemonize-ignored-return-values.patch @@ -0,0 +1,120 @@ +From 9f726593bc3acbc247876dcc4d79fbf046958003 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:49 +0100 +Subject: [PATCH 018/116] virtiofsd: Fix fuse_daemonize ignored return values +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-15-dgilbert@redhat.com> +Patchwork-id: 93469 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 014/112] virtiofsd: Fix fuse_daemonize ignored return values +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +QEMU's compiler enables warnings/errors for ignored values +and the (void) trick used in the fuse code isn't enough. +Turn all the return values into a return value on the function. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Tested-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 30d8e49760712d65697ea517c53671bd1d214fc7) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 33 ++++++++++++++++++++++----------- + 1 file changed, 22 insertions(+), 11 deletions(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5e6f205..d9227d7 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -10,12 +10,10 @@ + * See the file COPYING.LIB. 
+ */ + +-#include "config.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" + #include "fuse_misc.h" + #include "fuse_opt.h" +-#include "mount_util.h" + + #include + #include +@@ -171,6 +169,7 @@ int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) + + int fuse_daemonize(int foreground) + { ++ int ret = 0, rett; + if (!foreground) { + int nullfd; + int waiter[2]; +@@ -192,8 +191,8 @@ int fuse_daemonize(int foreground) + case 0: + break; + default: +- (void)read(waiter[0], &completed, sizeof(completed)); +- _exit(0); ++ _exit(read(waiter[0], &completed, ++ sizeof(completed) != sizeof(completed))); + } + + if (setsid() == -1) { +@@ -201,13 +200,22 @@ int fuse_daemonize(int foreground) + return -1; + } + +- (void)chdir("/"); ++ ret = chdir("/"); + + nullfd = open("/dev/null", O_RDWR, 0); + if (nullfd != -1) { +- (void)dup2(nullfd, 0); +- (void)dup2(nullfd, 1); +- (void)dup2(nullfd, 2); ++ rett = dup2(nullfd, 0); ++ if (!ret) { ++ ret = rett; ++ } ++ rett = dup2(nullfd, 1); ++ if (!ret) { ++ ret = rett; ++ } ++ rett = dup2(nullfd, 2); ++ if (!ret) { ++ ret = rett; ++ } + if (nullfd > 2) { + close(nullfd); + } +@@ -215,13 +223,16 @@ int fuse_daemonize(int foreground) + + /* Propagate completion of daemon initialization */ + completed = 1; +- (void)write(waiter[1], &completed, sizeof(completed)); ++ rett = write(waiter[1], &completed, sizeof(completed)); ++ if (!ret) { ++ ret = rett; ++ } + close(waiter[0]); + close(waiter[1]); + } else { +- (void)chdir("/"); ++ ret = chdir("/"); + } +- return 0; ++ return ret; + } + + void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch b/kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch new file mode 100755 index 0000000..98907a5 --- /dev/null +++ b/kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch @@ -0,0 +1,51 @@ +From 6abfb7b3c37015ff901d11f178bc6900deec2acf Mon Sep 17 00:00:00 2001 +From: Max Reitz 
+Date: Fri, 6 Aug 2021 11:58:27 -0400 +Subject: [PATCH 2/2] virtiofsd: Fix the help message of posix lock +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Max Reitz +Message-id: <20210806115827.740945-3-mreitz@redhat.com> +Patchwork-id: 101969 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] virtiofsd: Fix the help message of posix lock +Bugzilla: 1967496 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Vivek Goyal + +From: Jiachen Zhang + +The commit 88fc107956a5812649e5918e0c092d3f78bb28ad disabled remote +posix locks by default. But the --help message still says it is enabled +by default. So fix it to output no_posix_lock. + +Signed-off-by: Jiachen Zhang +Message-Id: <20201027081558.29904-1-zhangjiachen.jaycee@bytedance.com> +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 0429eaf518be1d4742356056e6c886b7f9bc9712) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/helper.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5b222ea49b..813d9490e5 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -163,7 +163,7 @@ void fuse_cmdline_help(void) + " default: false\n" + " -o posix_lock|no_posix_lock\n" + " enable/disable remote posix lock\n" +- " default: posix_lock\n" ++ " default: no_posix_lock\n" + " -o readdirplus|no_readdirplus\n" + " enable/disable readirplus\n" + " default: readdirplus except with " +-- +2.27.0 + diff --git a/kvm-virtiofsd-Fix-xattr-operations.patch b/kvm-virtiofsd-Fix-xattr-operations.patch new file mode 100755 index 0000000..532948f --- /dev/null +++ b/kvm-virtiofsd-Fix-xattr-operations.patch @@ -0,0 +1,327 @@ +From 8721796f22a8a61d82974088e542377ee6db209e Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:14 +0000 +Subject: [PATCH 18/18] virtiofsd: Fix xattr operations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-8-dgilbert@redhat.com> +Patchwork-id: 94123 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 7/7] virtiofsd: Fix xattr operations +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: Misono Tomohiro + +Current virtiofsd has problems with xattr operations and +they do not work properly for directory/symlink/special file. + +The fundamental cause is that virtiofsd uses openat() + f...xattr() +system calls for xattr operation but we should not open symlink/special +file in the daemon. Therefore the function is restricted. + +Fix this problem by: + 1. during setup of each thread, call unshare(CLONE_FS) + 2. in xattr operations (e.g. lo_getxattr), if inode is not a regular + file or directory, use fchdir(proc_self_fd) + ...xattr() + + fchdir(root.fd) instead of openat() + f...xattr() + + (Note: for a regular file/directory openat() + f...xattr() + is still used for performance reasons) + +With this patch, xfstests generic/062 passes on virtiofs. + +This fix was suggested by Miklos Szeredi and Stefan Hajnoczi. +The original discussion can be found here: + https://www.redhat.com/archives/virtio-fs/2019-October/msg00046.html + +Signed-off-by: Misono Tomohiro +Message-Id: <20200227055927.24566-3-misono.tomohiro@jp.fujitsu.com> +Acked-by: Vivek Goyal +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit bdfd66788349acc43cd3f1298718ad491663cfcc) +Signed-off-by: Danilo C. L.
de Paula +--- + tools/virtiofsd/fuse_virtio.c | 13 +++++ + tools/virtiofsd/passthrough_ll.c | 105 +++++++++++++++++++++------------------ + tools/virtiofsd/seccomp.c | 6 +++ + 3 files changed, 77 insertions(+), 47 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index dd1c605..3b6d16a 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -426,6 +426,8 @@ err: + return ret; + } + ++static __thread bool clone_fs_called; ++ + /* Process one FVRequest in a thread pool */ + static void fv_queue_worker(gpointer data, gpointer user_data) + { +@@ -441,6 +443,17 @@ static void fv_queue_worker(gpointer data, gpointer user_data) + + assert(se->bufsize > sizeof(struct fuse_in_header)); + ++ if (!clone_fs_called) { ++ int ret; ++ ++ /* unshare FS for xattr operation */ ++ ret = unshare(CLONE_FS); ++ /* should not fail */ ++ assert(ret == 0); ++ ++ clone_fs_called = true; ++ } ++ + /* + * An element contains one request and the space to send our response + * They're spread over multiple descriptors in a scatter/gather set +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 50c7273..9cba3f1 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -123,7 +123,7 @@ struct lo_inode { + pthread_mutex_t plock_mutex; + GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ + +- bool is_symlink; ++ mode_t filetype; + }; + + struct lo_cred { +@@ -695,7 +695,7 @@ static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode, + struct lo_inode *parent; + char path[PATH_MAX]; + +- if (inode->is_symlink) { ++ if (S_ISLNK(inode->filetype)) { + res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH); + if (res == -1 && errno == EINVAL) { + /* Sorry, no race free way to set times on symlink. 
*/ +@@ -929,7 +929,8 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out_err; + } + +- inode->is_symlink = S_ISLNK(e->attr.st_mode); ++ /* cache only filetype */ ++ inode->filetype = (e->attr.st_mode & S_IFMT); + + /* + * One for the caller and one for nlookup (released in +@@ -1139,7 +1140,7 @@ static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode, + struct lo_inode *parent; + char path[PATH_MAX]; + +- if (inode->is_symlink) { ++ if (S_ISLNK(inode->filetype)) { + res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); + if (res == -1 && (errno == ENOENT || errno == EINVAL)) { + /* Sorry, no race free way to hard-link a symlink. */ +@@ -2193,12 +2194,6 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", + ino, name, size); + +- if (inode->is_symlink) { +- /* Sorry, no race free way to getxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- + if (size) { + value = malloc(size); + if (!value) { +@@ -2207,12 +2202,25 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + } + + sprintf(procname, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, procname, O_RDONLY); +- if (fd < 0) { +- goto out_err; ++ /* ++ * It is not safe to open() non-regular/non-dir files in file server ++ * unless O_PATH is used, so use that method for regular files/dir ++ * only (as it seems giving less performance overhead). ++ * Otherwise, call fchdir() to avoid open(). 
++ */ ++ if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ goto out_err; ++ } ++ ret = fgetxattr(fd, name, value, size); ++ } else { ++ /* fchdir should not fail here */ ++ assert(fchdir(lo->proc_self_fd) == 0); ++ ret = getxattr(procname, name, value, size); ++ assert(fchdir(lo->root.fd) == 0); + } + +- ret = fgetxattr(fd, name, value, size); + if (ret == -1) { + goto out_err; + } +@@ -2266,12 +2274,6 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino, + size); + +- if (inode->is_symlink) { +- /* Sorry, no race free way to listxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- + if (size) { + value = malloc(size); + if (!value) { +@@ -2280,12 +2282,19 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + } + + sprintf(procname, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, procname, O_RDONLY); +- if (fd < 0) { +- goto out_err; ++ if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ goto out_err; ++ } ++ ret = flistxattr(fd, value, size); ++ } else { ++ /* fchdir should not fail here */ ++ assert(fchdir(lo->proc_self_fd) == 0); ++ ret = listxattr(procname, value, size); ++ assert(fchdir(lo->root.fd) == 0); + } + +- ret = flistxattr(fd, value, size); + if (ret == -1) { + goto out_err; + } +@@ -2339,20 +2348,21 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 + ", name=%s value=%s size=%zd)\n", ino, name, value, size); + +- if (inode->is_symlink) { +- /* Sorry, no race free way to setxattr on symlink. 
*/ +- saverr = EPERM; +- goto out; +- } +- + sprintf(procname, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, procname, O_RDWR); +- if (fd < 0) { +- saverr = errno; +- goto out; ++ if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ saverr = errno; ++ goto out; ++ } ++ ret = fsetxattr(fd, name, value, size, flags); ++ } else { ++ /* fchdir should not fail here */ ++ assert(fchdir(lo->proc_self_fd) == 0); ++ ret = setxattr(procname, name, value, size, flags); ++ assert(fchdir(lo->root.fd) == 0); + } + +- ret = fsetxattr(fd, name, value, size, flags); + saverr = ret == -1 ? errno : 0; + + out: +@@ -2387,20 +2397,21 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino, + name); + +- if (inode->is_symlink) { +- /* Sorry, no race free way to setxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- + sprintf(procname, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, procname, O_RDWR); +- if (fd < 0) { +- saverr = errno; +- goto out; ++ if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ saverr = errno; ++ goto out; ++ } ++ ret = fremovexattr(fd, name); ++ } else { ++ /* fchdir should not fail here */ ++ assert(fchdir(lo->proc_self_fd) == 0); ++ ret = removexattr(procname, name); ++ assert(fchdir(lo->root.fd) == 0); + } + +- ret = fremovexattr(fd, name); + saverr = ret == -1 ? 
errno : 0; + + out: +@@ -2800,7 +2811,7 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) + exit(1); + } + +- root->is_symlink = false; ++ root->filetype = S_IFDIR; + root->fd = fd; + root->key.ino = stat.st_ino; + root->key.dev = stat.st_dev; +diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c +index 2d9d4a7..bd9e7b0 100644 +--- a/tools/virtiofsd/seccomp.c ++++ b/tools/virtiofsd/seccomp.c +@@ -41,6 +41,7 @@ static const int syscall_whitelist[] = { + SCMP_SYS(exit), + SCMP_SYS(exit_group), + SCMP_SYS(fallocate), ++ SCMP_SYS(fchdir), + SCMP_SYS(fchmodat), + SCMP_SYS(fchownat), + SCMP_SYS(fcntl), +@@ -62,7 +63,9 @@ static const int syscall_whitelist[] = { + SCMP_SYS(getpid), + SCMP_SYS(gettid), + SCMP_SYS(gettimeofday), ++ SCMP_SYS(getxattr), + SCMP_SYS(linkat), ++ SCMP_SYS(listxattr), + SCMP_SYS(lseek), + SCMP_SYS(madvise), + SCMP_SYS(mkdirat), +@@ -85,6 +88,7 @@ static const int syscall_whitelist[] = { + SCMP_SYS(recvmsg), + SCMP_SYS(renameat), + SCMP_SYS(renameat2), ++ SCMP_SYS(removexattr), + SCMP_SYS(rt_sigaction), + SCMP_SYS(rt_sigprocmask), + SCMP_SYS(rt_sigreturn), +@@ -98,10 +102,12 @@ static const int syscall_whitelist[] = { + SCMP_SYS(setresuid32), + #endif + SCMP_SYS(set_robust_list), ++ SCMP_SYS(setxattr), + SCMP_SYS(symlinkat), + SCMP_SYS(time), /* Rarely needed, except on static builds */ + SCMP_SYS(tgkill), + SCMP_SYS(unlinkat), ++ SCMP_SYS(unshare), + SCMP_SYS(utimensat), + SCMP_SYS(write), + SCMP_SYS(writev), +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Format-imported-files-to-qemu-style.patch b/kvm-virtiofsd-Format-imported-files-to-qemu-style.patch new file mode 100755 index 0000000..5593a33 --- /dev/null +++ b/kvm-virtiofsd-Format-imported-files-to-qemu-style.patch @@ -0,0 +1,14743 @@ +From e313ab94af558bbc133e7a93b0a6dbff706dd1d8 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:45 +0100 +Subject: [PATCH 014/116] virtiofsd: Format imported files to qemu style +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-11-dgilbert@redhat.com> +Patchwork-id: 93464 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 010/112] virtiofsd: Format imported files to qemu style +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Mostly using a set like: + +indent -nut -i 4 -nlp -br -cs -ce --no-space-after-function-call-names file +clang-format -style=file -i -- file +clang-tidy -fix-errors -checks=readability-braces-around-statements file +clang-format -style=file -i -- file + +With manual cleanups. + +The .clang-format used is below. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed by: Aleksandar Markovic + +Language: Cpp +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false # although we like it, it creates churn +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: false # churn +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None # AlwaysBreakAfterDefinitionReturnType is taken into account +AlwaysBreakBeforeMultilineStrings: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterStruct: false + AfterUnion: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: false +BreakStringLiterals: true 
+ColumnLimit: 80 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ForEachMacros: [ + 'CPU_FOREACH', + 'CPU_FOREACH_REVERSE', + 'CPU_FOREACH_SAFE', + 'IOMMU_NOTIFIER_FOREACH', + 'QLIST_FOREACH', + 'QLIST_FOREACH_ENTRY', + 'QLIST_FOREACH_RCU', + 'QLIST_FOREACH_SAFE', + 'QLIST_FOREACH_SAFE_RCU', + 'QSIMPLEQ_FOREACH', + 'QSIMPLEQ_FOREACH_SAFE', + 'QSLIST_FOREACH', + 'QSLIST_FOREACH_SAFE', + 'QTAILQ_FOREACH', + 'QTAILQ_FOREACH_REVERSE', + 'QTAILQ_FOREACH_SAFE', + 'QTAILQ_RAW_FOREACH', + 'RAMBLOCK_FOREACH' +] +IncludeCategories: + - Regex: '^"qemu/osdep.h' + Priority: -3 + - Regex: '^"(block|chardev|crypto|disas|exec|fpu|hw|io|libdecnumber|migration|monitor|net|qapi|qemu|qom|standard-headers|sysemu|ui)/' + Priority: -2 + - Regex: '^"(elf.h|qemu-common.h|glib-compat.h|qemu-io.h|trace-tcg.h)' + Priority: -1 + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '$' +IndentCaseLabels: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '.*_BEGIN$' # only PREC_BEGIN ? +MacroBlockEnd: '.*_END$' +MaxEmptyLinesToKeep: 2 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInContainerLiterals: true +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Auto +UseTab: Never +... + +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 7387863d033e8028aa09a815736617a7c4490827) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 434 ++-- + tools/virtiofsd/fuse.h | 1572 +++++++------- + tools/virtiofsd/fuse_common.h | 730 +++---- + tools/virtiofsd/fuse_i.h | 121 +- + tools/virtiofsd/fuse_log.c | 38 +- + tools/virtiofsd/fuse_log.h | 32 +- + tools/virtiofsd/fuse_lowlevel.c | 3638 +++++++++++++++++---------------- + tools/virtiofsd/fuse_lowlevel.h | 2392 +++++++++++----------- + tools/virtiofsd/fuse_misc.h | 30 +- + tools/virtiofsd/fuse_opt.c | 659 +++--- + tools/virtiofsd/fuse_opt.h | 79 +- + tools/virtiofsd/fuse_signals.c | 118 +- + tools/virtiofsd/helper.c | 506 ++--- + tools/virtiofsd/passthrough_helpers.h | 33 +- + tools/virtiofsd/passthrough_ll.c | 2061 ++++++++++--------- + 15 files changed, 6382 insertions(+), 6061 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index aefb7db..5df946c 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -1,252 +1,272 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2010 Miklos Szeredi +- +- Functions for dealing with `struct fuse_buf` and `struct +- fuse_bufvec`. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2010 Miklos Szeredi ++ * ++ * Functions for dealing with `struct fuse_buf` and `struct ++ * fuse_bufvec`. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. 
++ * See the file COPYING.LIB ++ */ + + #define _GNU_SOURCE + + #include "config.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" ++#include ++#include + #include + #include +-#include +-#include + + size_t fuse_buf_size(const struct fuse_bufvec *bufv) + { +- size_t i; +- size_t size = 0; +- +- for (i = 0; i < bufv->count; i++) { +- if (bufv->buf[i].size == SIZE_MAX) +- size = SIZE_MAX; +- else +- size += bufv->buf[i].size; +- } +- +- return size; ++ size_t i; ++ size_t size = 0; ++ ++ for (i = 0; i < bufv->count; i++) { ++ if (bufv->buf[i].size == SIZE_MAX) { ++ size = SIZE_MAX; ++ } else { ++ size += bufv->buf[i].size; ++ } ++ } ++ ++ return size; + } + + static size_t min_size(size_t s1, size_t s2) + { +- return s1 < s2 ? s1 : s2; ++ return s1 < s2 ? s1 : s2; + } + + static ssize_t fuse_buf_write(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len) ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) + { +- ssize_t res = 0; +- size_t copied = 0; +- +- while (len) { +- if (dst->flags & FUSE_BUF_FD_SEEK) { +- res = pwrite(dst->fd, (char *)src->mem + src_off, len, +- dst->pos + dst_off); +- } else { +- res = write(dst->fd, (char *)src->mem + src_off, len); +- } +- if (res == -1) { +- if (!copied) +- return -errno; +- break; +- } +- if (res == 0) +- break; +- +- copied += res; +- if (!(dst->flags & FUSE_BUF_FD_RETRY)) +- break; +- +- src_off += res; +- dst_off += res; +- len -= res; +- } +- +- return copied; ++ ssize_t res = 0; ++ size_t copied = 0; ++ ++ while (len) { ++ if (dst->flags & FUSE_BUF_FD_SEEK) { ++ res = pwrite(dst->fd, (char *)src->mem + src_off, len, ++ dst->pos + dst_off); ++ } else { ++ res = write(dst->fd, (char *)src->mem + src_off, len); ++ } ++ if (res == -1) { ++ if (!copied) { ++ return -errno; ++ } ++ break; ++ } ++ if (res == 0) { ++ break; ++ } ++ ++ copied += res; ++ if (!(dst->flags & FUSE_BUF_FD_RETRY)) { ++ break; ++ } ++ ++ src_off += res; ++ dst_off += res; ++ len -= 
res; ++ } ++ ++ return copied; + } + + static ssize_t fuse_buf_read(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len) ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) + { +- ssize_t res = 0; +- size_t copied = 0; +- +- while (len) { +- if (src->flags & FUSE_BUF_FD_SEEK) { +- res = pread(src->fd, (char *)dst->mem + dst_off, len, +- src->pos + src_off); +- } else { +- res = read(src->fd, (char *)dst->mem + dst_off, len); +- } +- if (res == -1) { +- if (!copied) +- return -errno; +- break; +- } +- if (res == 0) +- break; +- +- copied += res; +- if (!(src->flags & FUSE_BUF_FD_RETRY)) +- break; +- +- dst_off += res; +- src_off += res; +- len -= res; +- } +- +- return copied; ++ ssize_t res = 0; ++ size_t copied = 0; ++ ++ while (len) { ++ if (src->flags & FUSE_BUF_FD_SEEK) { ++ res = pread(src->fd, (char *)dst->mem + dst_off, len, ++ src->pos + src_off); ++ } else { ++ res = read(src->fd, (char *)dst->mem + dst_off, len); ++ } ++ if (res == -1) { ++ if (!copied) { ++ return -errno; ++ } ++ break; ++ } ++ if (res == 0) { ++ break; ++ } ++ ++ copied += res; ++ if (!(src->flags & FUSE_BUF_FD_RETRY)) { ++ break; ++ } ++ ++ dst_off += res; ++ src_off += res; ++ len -= res; ++ } ++ ++ return copied; + } + + static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len) ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) + { +- char buf[4096]; +- struct fuse_buf tmp = { +- .size = sizeof(buf), +- .flags = 0, +- }; +- ssize_t res; +- size_t copied = 0; +- +- tmp.mem = buf; +- +- while (len) { +- size_t this_len = min_size(tmp.size, len); +- size_t read_len; +- +- res = fuse_buf_read(&tmp, 0, src, src_off, this_len); +- if (res < 0) { +- if (!copied) +- return res; +- break; +- } +- if (res == 0) +- break; +- +- read_len = res; +- res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len); +- if (res < 0) { +- if (!copied) +- 
return res; +- break; +- } +- if (res == 0) +- break; +- +- copied += res; +- +- if (res < this_len) +- break; +- +- dst_off += res; +- src_off += res; +- len -= res; +- } +- +- return copied; ++ char buf[4096]; ++ struct fuse_buf tmp = { ++ .size = sizeof(buf), ++ .flags = 0, ++ }; ++ ssize_t res; ++ size_t copied = 0; ++ ++ tmp.mem = buf; ++ ++ while (len) { ++ size_t this_len = min_size(tmp.size, len); ++ size_t read_len; ++ ++ res = fuse_buf_read(&tmp, 0, src, src_off, this_len); ++ if (res < 0) { ++ if (!copied) { ++ return res; ++ } ++ break; ++ } ++ if (res == 0) { ++ break; ++ } ++ ++ read_len = res; ++ res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len); ++ if (res < 0) { ++ if (!copied) { ++ return res; ++ } ++ break; ++ } ++ if (res == 0) { ++ break; ++ } ++ ++ copied += res; ++ ++ if (res < this_len) { ++ break; ++ } ++ ++ dst_off += res; ++ src_off += res; ++ len -= res; ++ } ++ ++ return copied; + } + + static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len, enum fuse_buf_copy_flags flags) ++ const struct fuse_buf *src, size_t src_off, ++ size_t len, enum fuse_buf_copy_flags flags) + { +- int src_is_fd = src->flags & FUSE_BUF_IS_FD; +- int dst_is_fd = dst->flags & FUSE_BUF_IS_FD; +- +- if (!src_is_fd && !dst_is_fd) { +- char *dstmem = (char *)dst->mem + dst_off; +- char *srcmem = (char *)src->mem + src_off; +- +- if (dstmem != srcmem) { +- if (dstmem + len <= srcmem || srcmem + len <= dstmem) +- memcpy(dstmem, srcmem, len); +- else +- memmove(dstmem, srcmem, len); +- } +- +- return len; +- } else if (!src_is_fd) { +- return fuse_buf_write(dst, dst_off, src, src_off, len); +- } else if (!dst_is_fd) { +- return fuse_buf_read(dst, dst_off, src, src_off, len); +- } else { +- return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); +- } ++ int src_is_fd = src->flags & FUSE_BUF_IS_FD; ++ int dst_is_fd = dst->flags & FUSE_BUF_IS_FD; ++ ++ if (!src_is_fd && !dst_is_fd) { ++ 
char *dstmem = (char *)dst->mem + dst_off; ++ char *srcmem = (char *)src->mem + src_off; ++ ++ if (dstmem != srcmem) { ++ if (dstmem + len <= srcmem || srcmem + len <= dstmem) { ++ memcpy(dstmem, srcmem, len); ++ } else { ++ memmove(dstmem, srcmem, len); ++ } ++ } ++ ++ return len; ++ } else if (!src_is_fd) { ++ return fuse_buf_write(dst, dst_off, src, src_off, len); ++ } else if (!dst_is_fd) { ++ return fuse_buf_read(dst, dst_off, src, src_off, len); ++ } else { ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); ++ } + } + + static const struct fuse_buf *fuse_bufvec_current(struct fuse_bufvec *bufv) + { +- if (bufv->idx < bufv->count) +- return &bufv->buf[bufv->idx]; +- else +- return NULL; ++ if (bufv->idx < bufv->count) { ++ return &bufv->buf[bufv->idx]; ++ } else { ++ return NULL; ++ } + } + + static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) + { +- const struct fuse_buf *buf = fuse_bufvec_current(bufv); +- +- bufv->off += len; +- assert(bufv->off <= buf->size); +- if (bufv->off == buf->size) { +- assert(bufv->idx < bufv->count); +- bufv->idx++; +- if (bufv->idx == bufv->count) +- return 0; +- bufv->off = 0; +- } +- return 1; ++ const struct fuse_buf *buf = fuse_bufvec_current(bufv); ++ ++ bufv->off += len; ++ assert(bufv->off <= buf->size); ++ if (bufv->off == buf->size) { ++ assert(bufv->idx < bufv->count); ++ bufv->idx++; ++ if (bufv->idx == bufv->count) { ++ return 0; ++ } ++ bufv->off = 0; ++ } ++ return 1; + } + + ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv, +- enum fuse_buf_copy_flags flags) ++ enum fuse_buf_copy_flags flags) + { +- size_t copied = 0; +- +- if (dstv == srcv) +- return fuse_buf_size(dstv); +- +- for (;;) { +- const struct fuse_buf *src = fuse_bufvec_current(srcv); +- const struct fuse_buf *dst = fuse_bufvec_current(dstv); +- size_t src_len; +- size_t dst_len; +- size_t len; +- ssize_t res; +- +- if (src == NULL || dst == NULL) +- break; +- +- src_len = src->size - srcv->off; +- 
dst_len = dst->size - dstv->off; +- len = min_size(src_len, dst_len); +- +- res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len, flags); +- if (res < 0) { +- if (!copied) +- return res; +- break; +- } +- copied += res; +- +- if (!fuse_bufvec_advance(srcv, res) || +- !fuse_bufvec_advance(dstv, res)) +- break; +- +- if (res < len) +- break; +- } +- +- return copied; ++ size_t copied = 0; ++ ++ if (dstv == srcv) { ++ return fuse_buf_size(dstv); ++ } ++ ++ for (;;) { ++ const struct fuse_buf *src = fuse_bufvec_current(srcv); ++ const struct fuse_buf *dst = fuse_bufvec_current(dstv); ++ size_t src_len; ++ size_t dst_len; ++ size_t len; ++ ssize_t res; ++ ++ if (src == NULL || dst == NULL) { ++ break; ++ } ++ ++ src_len = src->size - srcv->off; ++ dst_len = dst->size - dstv->off; ++ len = min_size(src_len, dst_len); ++ ++ res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len, flags); ++ if (res < 0) { ++ if (!copied) { ++ return res; ++ } ++ break; ++ } ++ copied += res; ++ ++ if (!fuse_bufvec_advance(srcv, res) || ++ !fuse_bufvec_advance(dstv, res)) { ++ break; ++ } ++ ++ if (res < len) { ++ break; ++ } ++ } ++ ++ return copied; + } +diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h +index 3202fba..7a4c713 100644 +--- a/tools/virtiofsd/fuse.h ++++ b/tools/virtiofsd/fuse.h +@@ -1,15 +1,15 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. 
++ */ + + #ifndef FUSE_H_ + #define FUSE_H_ + +-/** @file ++/* + * + * This file defines the library interface of FUSE + * +@@ -19,15 +19,15 @@ + #include "fuse_common.h" + + #include +-#include +-#include + #include + #include ++#include + #include ++#include + +-/* ----------------------------------------------------------- * +- * Basic FUSE API * +- * ----------------------------------------------------------- */ ++/* ++ * Basic FUSE API ++ */ + + /** Handle for a FUSE filesystem */ + struct fuse; +@@ -36,38 +36,39 @@ struct fuse; + * Readdir flags, passed to ->readdir() + */ + enum fuse_readdir_flags { +- /** +- * "Plus" mode. +- * +- * The kernel wants to prefill the inode cache during readdir. The +- * filesystem may honour this by filling in the attributes and setting +- * FUSE_FILL_DIR_FLAGS for the filler function. The filesystem may also +- * just ignore this flag completely. +- */ +- FUSE_READDIR_PLUS = (1 << 0), ++ /** ++ * "Plus" mode. ++ * ++ * The kernel wants to prefill the inode cache during readdir. The ++ * filesystem may honour this by filling in the attributes and setting ++ * FUSE_FILL_DIR_FLAGS for the filler function. The filesystem may also ++ * just ignore this flag completely. ++ */ ++ FUSE_READDIR_PLUS = (1 << 0), + }; + + enum fuse_fill_dir_flags { +- /** +- * "Plus" mode: all file attributes are valid +- * +- * The attributes are used by the kernel to prefill the inode cache +- * during a readdir. +- * +- * It is okay to set FUSE_FILL_DIR_PLUS if FUSE_READDIR_PLUS is not set +- * and vice versa. +- */ +- FUSE_FILL_DIR_PLUS = (1 << 1), ++ /** ++ * "Plus" mode: all file attributes are valid ++ * ++ * The attributes are used by the kernel to prefill the inode cache ++ * during a readdir. ++ * ++ * It is okay to set FUSE_FILL_DIR_PLUS if FUSE_READDIR_PLUS is not set ++ * and vice versa. 
++ */ ++ FUSE_FILL_DIR_PLUS = (1 << 1), + }; + +-/** Function to add an entry in a readdir() operation ++/** ++ * Function to add an entry in a readdir() operation + * + * The *off* parameter can be any non-zero value that enables the + * filesystem to identify the current point in the directory + * stream. It does not need to be the actual physical position. A + * value of zero is reserved to indicate that seeking in directories + * is not supported. +- * ++ * + * @param buf the buffer passed to the readdir() operation + * @param name the file name of the directory entry + * @param stat file attributes, can be NULL +@@ -75,9 +76,9 @@ enum fuse_fill_dir_flags { + * @param flags fill flags + * @return 1 if buffer is full, zero otherwise + */ +-typedef int (*fuse_fill_dir_t) (void *buf, const char *name, +- const struct stat *stbuf, off_t off, +- enum fuse_fill_dir_flags flags); ++typedef int (*fuse_fill_dir_t)(void *buf, const char *name, ++ const struct stat *stbuf, off_t off, ++ enum fuse_fill_dir_flags flags); + /** + * Configuration of the high-level API + * +@@ -87,186 +88,186 @@ typedef int (*fuse_fill_dir_t) (void *buf, const char *name, + * file system implementation. + */ + struct fuse_config { +- /** +- * If `set_gid` is non-zero, the st_gid attribute of each file +- * is overwritten with the value of `gid`. +- */ +- int set_gid; +- unsigned int gid; +- +- /** +- * If `set_uid` is non-zero, the st_uid attribute of each file +- * is overwritten with the value of `uid`. +- */ +- int set_uid; +- unsigned int uid; +- +- /** +- * If `set_mode` is non-zero, the any permissions bits set in +- * `umask` are unset in the st_mode attribute of each file. +- */ +- int set_mode; +- unsigned int umask; +- +- /** +- * The timeout in seconds for which name lookups will be +- * cached. +- */ +- double entry_timeout; +- +- /** +- * The timeout in seconds for which a negative lookup will be +- * cached. 
This means, that if file did not exist (lookup +- * retuned ENOENT), the lookup will only be redone after the +- * timeout, and the file/directory will be assumed to not +- * exist until then. A value of zero means that negative +- * lookups are not cached. +- */ +- double negative_timeout; +- +- /** +- * The timeout in seconds for which file/directory attributes +- * (as returned by e.g. the `getattr` handler) are cached. +- */ +- double attr_timeout; +- +- /** +- * Allow requests to be interrupted +- */ +- int intr; +- +- /** +- * Specify which signal number to send to the filesystem when +- * a request is interrupted. The default is hardcoded to +- * USR1. +- */ +- int intr_signal; +- +- /** +- * Normally, FUSE assigns inodes to paths only for as long as +- * the kernel is aware of them. With this option inodes are +- * instead remembered for at least this many seconds. This +- * will require more memory, but may be necessary when using +- * applications that make use of inode numbers. +- * +- * A number of -1 means that inodes will be remembered for the +- * entire life-time of the file-system process. +- */ +- int remember; +- +- /** +- * The default behavior is that if an open file is deleted, +- * the file is renamed to a hidden file (.fuse_hiddenXXX), and +- * only removed when the file is finally released. This +- * relieves the filesystem implementation of having to deal +- * with this problem. This option disables the hiding +- * behavior, and files are removed immediately in an unlink +- * operation (or in a rename operation which overwrites an +- * existing file). +- * +- * It is recommended that you not use the hard_remove +- * option. 
When hard_remove is set, the following libc +- * functions fail on unlinked files (returning errno of +- * ENOENT): read(2), write(2), fsync(2), close(2), f*xattr(2), +- * ftruncate(2), fstat(2), fchmod(2), fchown(2) +- */ +- int hard_remove; +- +- /** +- * Honor the st_ino field in the functions getattr() and +- * fill_dir(). This value is used to fill in the st_ino field +- * in the stat(2), lstat(2), fstat(2) functions and the d_ino +- * field in the readdir(2) function. The filesystem does not +- * have to guarantee uniqueness, however some applications +- * rely on this value being unique for the whole filesystem. +- * +- * Note that this does *not* affect the inode that libfuse +- * and the kernel use internally (also called the "nodeid"). +- */ +- int use_ino; +- +- /** +- * If use_ino option is not given, still try to fill in the +- * d_ino field in readdir(2). If the name was previously +- * looked up, and is still in the cache, the inode number +- * found there will be used. Otherwise it will be set to -1. +- * If use_ino option is given, this option is ignored. +- */ +- int readdir_ino; +- +- /** +- * This option disables the use of page cache (file content cache) +- * in the kernel for this filesystem. This has several affects: +- * +- * 1. Each read(2) or write(2) system call will initiate one +- * or more read or write operations, data will not be +- * cached in the kernel. +- * +- * 2. The return value of the read() and write() system calls +- * will correspond to the return values of the read and +- * write operations. This is useful for example if the +- * file size is not known in advance (before reading it). +- * +- * Internally, enabling this option causes fuse to set the +- * `direct_io` field of `struct fuse_file_info` - overwriting +- * any value that was put there by the file system. +- */ +- int direct_io; +- +- /** +- * This option disables flushing the cache of the file +- * contents on every open(2). 
This should only be enabled on +- * filesystems where the file data is never changed +- * externally (not through the mounted FUSE filesystem). Thus +- * it is not suitable for network filesystems and other +- * intermediate filesystems. +- * +- * NOTE: if this option is not specified (and neither +- * direct_io) data is still cached after the open(2), so a +- * read(2) system call will not always initiate a read +- * operation. +- * +- * Internally, enabling this option causes fuse to set the +- * `keep_cache` field of `struct fuse_file_info` - overwriting +- * any value that was put there by the file system. +- */ +- int kernel_cache; +- +- /** +- * This option is an alternative to `kernel_cache`. Instead of +- * unconditionally keeping cached data, the cached data is +- * invalidated on open(2) if if the modification time or the +- * size of the file has changed since it was last opened. +- */ +- int auto_cache; +- +- /** +- * The timeout in seconds for which file attributes are cached +- * for the purpose of checking if auto_cache should flush the +- * file data on open. +- */ +- int ac_attr_timeout_set; +- double ac_attr_timeout; +- +- /** +- * If this option is given the file-system handlers for the +- * following operations will not receive path information: +- * read, write, flush, release, fsync, readdir, releasedir, +- * fsyncdir, lock, ioctl and poll. +- * +- * For the truncate, getattr, chmod, chown and utimens +- * operations the path will be provided only if the struct +- * fuse_file_info argument is NULL. +- */ +- int nullpath_ok; +- +- /** +- * The remaining options are used by libfuse internally and +- * should not be touched. +- */ +- int show_help; +- char *modules; +- int debug; ++ /** ++ * If `set_gid` is non-zero, the st_gid attribute of each file ++ * is overwritten with the value of `gid`. 
++ */ ++ int set_gid; ++ unsigned int gid; ++ ++ /** ++ * If `set_uid` is non-zero, the st_uid attribute of each file ++ * is overwritten with the value of `uid`. ++ */ ++ int set_uid; ++ unsigned int uid; ++ ++ /** ++ * If `set_mode` is non-zero, the any permissions bits set in ++ * `umask` are unset in the st_mode attribute of each file. ++ */ ++ int set_mode; ++ unsigned int umask; ++ ++ /** ++ * The timeout in seconds for which name lookups will be ++ * cached. ++ */ ++ double entry_timeout; ++ ++ /** ++ * The timeout in seconds for which a negative lookup will be ++ * cached. This means, that if file did not exist (lookup ++ * retuned ENOENT), the lookup will only be redone after the ++ * timeout, and the file/directory will be assumed to not ++ * exist until then. A value of zero means that negative ++ * lookups are not cached. ++ */ ++ double negative_timeout; ++ ++ /** ++ * The timeout in seconds for which file/directory attributes ++ * (as returned by e.g. the `getattr` handler) are cached. ++ */ ++ double attr_timeout; ++ ++ /** ++ * Allow requests to be interrupted ++ */ ++ int intr; ++ ++ /** ++ * Specify which signal number to send to the filesystem when ++ * a request is interrupted. The default is hardcoded to ++ * USR1. ++ */ ++ int intr_signal; ++ ++ /** ++ * Normally, FUSE assigns inodes to paths only for as long as ++ * the kernel is aware of them. With this option inodes are ++ * instead remembered for at least this many seconds. This ++ * will require more memory, but may be necessary when using ++ * applications that make use of inode numbers. ++ * ++ * A number of -1 means that inodes will be remembered for the ++ * entire life-time of the file-system process. ++ */ ++ int remember; ++ ++ /** ++ * The default behavior is that if an open file is deleted, ++ * the file is renamed to a hidden file (.fuse_hiddenXXX), and ++ * only removed when the file is finally released. 
This ++ * relieves the filesystem implementation of having to deal ++ * with this problem. This option disables the hiding ++ * behavior, and files are removed immediately in an unlink ++ * operation (or in a rename operation which overwrites an ++ * existing file). ++ * ++ * It is recommended that you not use the hard_remove ++ * option. When hard_remove is set, the following libc ++ * functions fail on unlinked files (returning errno of ++ * ENOENT): read(2), write(2), fsync(2), close(2), f*xattr(2), ++ * ftruncate(2), fstat(2), fchmod(2), fchown(2) ++ */ ++ int hard_remove; ++ ++ /** ++ * Honor the st_ino field in the functions getattr() and ++ * fill_dir(). This value is used to fill in the st_ino field ++ * in the stat(2), lstat(2), fstat(2) functions and the d_ino ++ * field in the readdir(2) function. The filesystem does not ++ * have to guarantee uniqueness, however some applications ++ * rely on this value being unique for the whole filesystem. ++ * ++ * Note that this does *not* affect the inode that libfuse ++ * and the kernel use internally (also called the "nodeid"). ++ */ ++ int use_ino; ++ ++ /** ++ * If use_ino option is not given, still try to fill in the ++ * d_ino field in readdir(2). If the name was previously ++ * looked up, and is still in the cache, the inode number ++ * found there will be used. Otherwise it will be set to -1. ++ * If use_ino option is given, this option is ignored. ++ */ ++ int readdir_ino; ++ ++ /** ++ * This option disables the use of page cache (file content cache) ++ * in the kernel for this filesystem. This has several affects: ++ * ++ * 1. Each read(2) or write(2) system call will initiate one ++ * or more read or write operations, data will not be ++ * cached in the kernel. ++ * ++ * 2. The return value of the read() and write() system calls ++ * will correspond to the return values of the read and ++ * write operations. This is useful for example if the ++ * file size is not known in advance (before reading it). 
++ * ++ * Internally, enabling this option causes fuse to set the ++ * `direct_io` field of `struct fuse_file_info` - overwriting ++ * any value that was put there by the file system. ++ */ ++ int direct_io; ++ ++ /** ++ * This option disables flushing the cache of the file ++ * contents on every open(2). This should only be enabled on ++ * filesystems where the file data is never changed ++ * externally (not through the mounted FUSE filesystem). Thus ++ * it is not suitable for network filesystems and other ++ * intermediate filesystems. ++ * ++ * NOTE: if this option is not specified (and neither ++ * direct_io) data is still cached after the open(2), so a ++ * read(2) system call will not always initiate a read ++ * operation. ++ * ++ * Internally, enabling this option causes fuse to set the ++ * `keep_cache` field of `struct fuse_file_info` - overwriting ++ * any value that was put there by the file system. ++ */ ++ int kernel_cache; ++ ++ /** ++ * This option is an alternative to `kernel_cache`. Instead of ++ * unconditionally keeping cached data, the cached data is ++ * invalidated on open(2) if if the modification time or the ++ * size of the file has changed since it was last opened. ++ */ ++ int auto_cache; ++ ++ /** ++ * The timeout in seconds for which file attributes are cached ++ * for the purpose of checking if auto_cache should flush the ++ * file data on open. ++ */ ++ int ac_attr_timeout_set; ++ double ac_attr_timeout; ++ ++ /** ++ * If this option is given the file-system handlers for the ++ * following operations will not receive path information: ++ * read, write, flush, release, fsync, readdir, releasedir, ++ * fsyncdir, lock, ioctl and poll. ++ * ++ * For the truncate, getattr, chmod, chown and utimens ++ * operations the path will be provided only if the struct ++ * fuse_file_info argument is NULL. ++ */ ++ int nullpath_ok; ++ ++ /** ++ * The remaining options are used by libfuse internally and ++ * should not be touched. 
++ */ ++ int show_help; ++ char *modules; ++ int debug; + }; + + +@@ -293,515 +294,535 @@ struct fuse_config { + * Almost all operations take a path which can be of any length. + */ + struct fuse_operations { +- /** Get file attributes. +- * +- * Similar to stat(). The 'st_dev' and 'st_blksize' fields are +- * ignored. The 'st_ino' field is ignored except if the 'use_ino' +- * mount option is given. In that case it is passed to userspace, +- * but libfuse and the kernel will still assign a different +- * inode for internal use (called the "nodeid"). +- * +- * `fi` will always be NULL if the file is not currently open, but +- * may also be NULL if the file is open. +- */ +- int (*getattr) (const char *, struct stat *, struct fuse_file_info *fi); +- +- /** Read the target of a symbolic link +- * +- * The buffer should be filled with a null terminated string. The +- * buffer size argument includes the space for the terminating +- * null character. If the linkname is too long to fit in the +- * buffer, it should be truncated. The return value should be 0 +- * for success. +- */ +- int (*readlink) (const char *, char *, size_t); +- +- /** Create a file node +- * +- * This is called for creation of all non-directory, non-symlink +- * nodes. If the filesystem defines a create() method, then for +- * regular files that will be called instead. +- */ +- int (*mknod) (const char *, mode_t, dev_t); +- +- /** Create a directory +- * +- * Note that the mode argument may not have the type specification +- * bits set, i.e. S_ISDIR(mode) can be false. To obtain the +- * correct directory type bits use mode|S_IFDIR +- * */ +- int (*mkdir) (const char *, mode_t); +- +- /** Remove a file */ +- int (*unlink) (const char *); +- +- /** Remove a directory */ +- int (*rmdir) (const char *); +- +- /** Create a symbolic link */ +- int (*symlink) (const char *, const char *); +- +- /** Rename a file +- * +- * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. 
If +- * RENAME_NOREPLACE is specified, the filesystem must not +- * overwrite *newname* if it exists and return an error +- * instead. If `RENAME_EXCHANGE` is specified, the filesystem +- * must atomically exchange the two files, i.e. both must +- * exist and neither may be deleted. +- */ +- int (*rename) (const char *, const char *, unsigned int flags); +- +- /** Create a hard link to a file */ +- int (*link) (const char *, const char *); +- +- /** Change the permission bits of a file +- * +- * `fi` will always be NULL if the file is not currenlty open, but +- * may also be NULL if the file is open. +- */ +- int (*chmod) (const char *, mode_t, struct fuse_file_info *fi); +- +- /** Change the owner and group of a file +- * +- * `fi` will always be NULL if the file is not currenlty open, but +- * may also be NULL if the file is open. +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- */ +- int (*chown) (const char *, uid_t, gid_t, struct fuse_file_info *fi); +- +- /** Change the size of a file +- * +- * `fi` will always be NULL if the file is not currenlty open, but +- * may also be NULL if the file is open. +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- */ +- int (*truncate) (const char *, off_t, struct fuse_file_info *fi); +- +- /** Open a file +- * +- * Open flags are available in fi->flags. The following rules +- * apply. +- * +- * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be +- * filtered out / handled by the kernel. +- * +- * - Access modes (O_RDONLY, O_WRONLY, O_RDWR, O_EXEC, O_SEARCH) +- * should be used by the filesystem to check if the operation is +- * permitted. If the ``-o default_permissions`` mount option is +- * given, this check is already done by the kernel before calling +- * open() and may thus be omitted by the filesystem. 
+- * +- * - When writeback caching is enabled, the kernel may send +- * read requests even for files opened with O_WRONLY. The +- * filesystem should be prepared to handle this. +- * +- * - When writeback caching is disabled, the filesystem is +- * expected to properly handle the O_APPEND flag and ensure +- * that each write is appending to the end of the file. +- * +- * - When writeback caching is enabled, the kernel will +- * handle O_APPEND. However, unless all changes to the file +- * come through the kernel this will not work reliably. The +- * filesystem should thus either ignore the O_APPEND flag +- * (and let the kernel handle it), or return an error +- * (indicating that reliably O_APPEND is not available). +- * +- * Filesystem may store an arbitrary file handle (pointer, +- * index, etc) in fi->fh, and use this in other all other file +- * operations (read, write, flush, release, fsync). +- * +- * Filesystem may also implement stateless file I/O and not store +- * anything in fi->fh. +- * +- * There are also some flags (direct_io, keep_cache) which the +- * filesystem may set in fi, to change the way the file is opened. +- * See fuse_file_info structure in for more details. +- * +- * If this request is answered with an error code of ENOSYS +- * and FUSE_CAP_NO_OPEN_SUPPORT is set in +- * `fuse_conn_info.capable`, this is treated as success and +- * future calls to open will also succeed without being send +- * to the filesystem process. +- * +- */ +- int (*open) (const char *, struct fuse_file_info *); +- +- /** Read data from an open file +- * +- * Read should return exactly the number of bytes requested except +- * on EOF or error, otherwise the rest of the data will be +- * substituted with zeroes. An exception to this is when the +- * 'direct_io' mount option is specified, in which case the return +- * value of the read system call will reflect the return value of +- * this operation. 
+- */ +- int (*read) (const char *, char *, size_t, off_t, +- struct fuse_file_info *); +- +- /** Write data to an open file +- * +- * Write should return exactly the number of bytes requested +- * except on error. An exception to this is when the 'direct_io' +- * mount option is specified (see read operation). +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- */ +- int (*write) (const char *, const char *, size_t, off_t, +- struct fuse_file_info *); +- +- /** Get file system statistics +- * +- * The 'f_favail', 'f_fsid' and 'f_flag' fields are ignored +- */ +- int (*statfs) (const char *, struct statvfs *); +- +- /** Possibly flush cached data +- * +- * BIG NOTE: This is not equivalent to fsync(). It's not a +- * request to sync dirty data. +- * +- * Flush is called on each close() of a file descriptor, as opposed to +- * release which is called on the close of the last file descriptor for +- * a file. Under Linux, errors returned by flush() will be passed to +- * userspace as errors from close(), so flush() is a good place to write +- * back any cached dirty data. However, many applications ignore errors +- * on close(), and on non-Linux systems, close() may succeed even if flush() +- * returns an error. For these reasons, filesystems should not assume +- * that errors returned by flush will ever be noticed or even +- * delivered. +- * +- * NOTE: The flush() method may be called more than once for each +- * open(). This happens if more than one file descriptor refers to an +- * open file handle, e.g. due to dup(), dup2() or fork() calls. It is +- * not possible to determine if a flush is final, so each flush should +- * be treated equally. Multiple write-flush sequences are relatively +- * rare, so this shouldn't be a problem. +- * +- * Filesystems shouldn't assume that flush will be called at any +- * particular point. It may be called more times than expected, or not +- * at all. 
+- * +- * [close]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html +- */ +- int (*flush) (const char *, struct fuse_file_info *); +- +- /** Release an open file +- * +- * Release is called when there are no more references to an open +- * file: all file descriptors are closed and all memory mappings +- * are unmapped. +- * +- * For every open() call there will be exactly one release() call +- * with the same flags and file handle. It is possible to +- * have a file opened more than once, in which case only the last +- * release will mean, that no more reads/writes will happen on the +- * file. The return value of release is ignored. +- */ +- int (*release) (const char *, struct fuse_file_info *); +- +- /** Synchronize file contents +- * +- * If the datasync parameter is non-zero, then only the user data +- * should be flushed, not the meta data. +- */ +- int (*fsync) (const char *, int, struct fuse_file_info *); +- +- /** Set extended attributes */ +- int (*setxattr) (const char *, const char *, const char *, size_t, int); +- +- /** Get extended attributes */ +- int (*getxattr) (const char *, const char *, char *, size_t); +- +- /** List extended attributes */ +- int (*listxattr) (const char *, char *, size_t); +- +- /** Remove extended attributes */ +- int (*removexattr) (const char *, const char *); +- +- /** Open directory +- * +- * Unless the 'default_permissions' mount option is given, +- * this method should check if opendir is permitted for this +- * directory. Optionally opendir may also return an arbitrary +- * filehandle in the fuse_file_info structure, which will be +- * passed to readdir, releasedir and fsyncdir. +- */ +- int (*opendir) (const char *, struct fuse_file_info *); +- +- /** Read directory +- * +- * The filesystem may choose between two modes of operation: +- * +- * 1) The readdir implementation ignores the offset parameter, and +- * passes zero to the filler function's offset. 
The filler +- * function will not return '1' (unless an error happens), so the +- * whole directory is read in a single readdir operation. +- * +- * 2) The readdir implementation keeps track of the offsets of the +- * directory entries. It uses the offset parameter and always +- * passes non-zero offset to the filler function. When the buffer +- * is full (or an error happens) the filler function will return +- * '1'. +- */ +- int (*readdir) (const char *, void *, fuse_fill_dir_t, off_t, +- struct fuse_file_info *, enum fuse_readdir_flags); +- +- /** Release directory +- */ +- int (*releasedir) (const char *, struct fuse_file_info *); +- +- /** Synchronize directory contents +- * +- * If the datasync parameter is non-zero, then only the user data +- * should be flushed, not the meta data +- */ +- int (*fsyncdir) (const char *, int, struct fuse_file_info *); +- +- /** +- * Initialize filesystem +- * +- * The return value will passed in the `private_data` field of +- * `struct fuse_context` to all file operations, and as a +- * parameter to the destroy() method. It overrides the initial +- * value provided to fuse_main() / fuse_new(). +- */ +- void *(*init) (struct fuse_conn_info *conn, +- struct fuse_config *cfg); +- +- /** +- * Clean up filesystem +- * +- * Called on filesystem exit. +- */ +- void (*destroy) (void *private_data); +- +- /** +- * Check file access permissions +- * +- * This will be called for the access() system call. If the +- * 'default_permissions' mount option is given, this method is not +- * called. +- * +- * This method is not called under Linux kernel versions 2.4.x +- */ +- int (*access) (const char *, int); +- +- /** +- * Create and open a file +- * +- * If the file does not exist, first create it with the specified +- * mode, and then open it. +- * +- * If this method is not implemented or under Linux kernel +- * versions earlier than 2.6.15, the mknod() and open() methods +- * will be called instead. 
+- */ +- int (*create) (const char *, mode_t, struct fuse_file_info *); +- +- /** +- * Perform POSIX file locking operation +- * +- * The cmd argument will be either F_GETLK, F_SETLK or F_SETLKW. +- * +- * For the meaning of fields in 'struct flock' see the man page +- * for fcntl(2). The l_whence field will always be set to +- * SEEK_SET. +- * +- * For checking lock ownership, the 'fuse_file_info->owner' +- * argument must be used. +- * +- * For F_GETLK operation, the library will first check currently +- * held locks, and if a conflicting lock is found it will return +- * information without calling this method. This ensures, that +- * for local locks the l_pid field is correctly filled in. The +- * results may not be accurate in case of race conditions and in +- * the presence of hard links, but it's unlikely that an +- * application would rely on accurate GETLK results in these +- * cases. If a conflicting lock is not found, this method will be +- * called, and the filesystem may fill out l_pid by a meaningful +- * value, or it may leave this field zero. +- * +- * For F_SETLK and F_SETLKW the l_pid field will be set to the pid +- * of the process performing the locking operation. +- * +- * Note: if this method is not implemented, the kernel will still +- * allow file locking to work locally. Hence it is only +- * interesting for network filesystems and similar. +- */ +- int (*lock) (const char *, struct fuse_file_info *, int cmd, +- struct flock *); +- +- /** +- * Change the access and modification times of a file with +- * nanosecond resolution +- * +- * This supersedes the old utime() interface. New applications +- * should use this. +- * +- * `fi` will always be NULL if the file is not currenlty open, but +- * may also be NULL if the file is open. +- * +- * See the utimensat(2) man page for details. 
+- */ +- int (*utimens) (const char *, const struct timespec tv[2], +- struct fuse_file_info *fi); +- +- /** +- * Map block index within file to block index within device +- * +- * Note: This makes sense only for block device backed filesystems +- * mounted with the 'blkdev' option +- */ +- int (*bmap) (const char *, size_t blocksize, uint64_t *idx); +- +- /** +- * Ioctl +- * +- * flags will have FUSE_IOCTL_COMPAT set for 32bit ioctls in +- * 64bit environment. The size and direction of data is +- * determined by _IOC_*() decoding of cmd. For _IOC_NONE, +- * data will be NULL, for _IOC_WRITE data is out area, for +- * _IOC_READ in area and if both are set in/out area. In all +- * non-NULL cases, the area is of _IOC_SIZE(cmd) bytes. +- * +- * If flags has FUSE_IOCTL_DIR then the fuse_file_info refers to a +- * directory file handle. +- * +- * Note : the unsigned long request submitted by the application +- * is truncated to 32 bits. +- */ +- int (*ioctl) (const char *, unsigned int cmd, void *arg, +- struct fuse_file_info *, unsigned int flags, void *data); +- +- /** +- * Poll for IO readiness events +- * +- * Note: If ph is non-NULL, the client should notify +- * when IO readiness events occur by calling +- * fuse_notify_poll() with the specified ph. +- * +- * Regardless of the number of times poll with a non-NULL ph +- * is received, single notification is enough to clear all. +- * Notifying more times incurs overhead but doesn't harm +- * correctness. +- * +- * The callee is responsible for destroying ph with +- * fuse_pollhandle_destroy() when no longer in use. +- */ +- int (*poll) (const char *, struct fuse_file_info *, +- struct fuse_pollhandle *ph, unsigned *reventsp); +- +- /** Write contents of buffer to an open file +- * +- * Similar to the write() method, but data is supplied in a +- * generic buffer. Use fuse_buf_copy() to transfer data to +- * the destination. 
+- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- */ +- int (*write_buf) (const char *, struct fuse_bufvec *buf, off_t off, +- struct fuse_file_info *); +- +- /** Store data from an open file in a buffer +- * +- * Similar to the read() method, but data is stored and +- * returned in a generic buffer. +- * +- * No actual copying of data has to take place, the source +- * file descriptor may simply be stored in the buffer for +- * later data transfer. +- * +- * The buffer must be allocated dynamically and stored at the +- * location pointed to by bufp. If the buffer contains memory +- * regions, they too must be allocated using malloc(). The +- * allocated memory will be freed by the caller. +- */ +- int (*read_buf) (const char *, struct fuse_bufvec **bufp, +- size_t size, off_t off, struct fuse_file_info *); +- /** +- * Perform BSD file locking operation +- * +- * The op argument will be either LOCK_SH, LOCK_EX or LOCK_UN +- * +- * Nonblocking requests will be indicated by ORing LOCK_NB to +- * the above operations +- * +- * For more information see the flock(2) manual page. +- * +- * Additionally fi->owner will be set to a value unique to +- * this open file. This same value will be supplied to +- * ->release() when the file is released. +- * +- * Note: if this method is not implemented, the kernel will still +- * allow file locking to work locally. Hence it is only +- * interesting for network filesystems and similar. +- */ +- int (*flock) (const char *, struct fuse_file_info *, int op); +- +- /** +- * Allocates space for an open file +- * +- * This function ensures that required space is allocated for specified +- * file. If this function returns success then any subsequent write +- * request to specified range is guaranteed not to fail because of lack +- * of space on the file system media. 
+- */ +- int (*fallocate) (const char *, int, off_t, off_t, +- struct fuse_file_info *); +- +- /** +- * Copy a range of data from one file to another +- * +- * Performs an optimized copy between two file descriptors without the +- * additional cost of transferring data through the FUSE kernel module +- * to user space (glibc) and then back into the FUSE filesystem again. +- * +- * In case this method is not implemented, glibc falls back to reading +- * data from the source and writing to the destination. Effectively +- * doing an inefficient copy of the data. +- */ +- ssize_t (*copy_file_range) (const char *path_in, +- struct fuse_file_info *fi_in, +- off_t offset_in, const char *path_out, +- struct fuse_file_info *fi_out, +- off_t offset_out, size_t size, int flags); +- +- /** +- * Find next data or hole after the specified offset +- */ +- off_t (*lseek) (const char *, off_t off, int whence, struct fuse_file_info *); ++ /** ++ * Get file attributes. ++ * ++ * Similar to stat(). The 'st_dev' and 'st_blksize' fields are ++ * ignored. The 'st_ino' field is ignored except if the 'use_ino' ++ * mount option is given. In that case it is passed to userspace, ++ * but libfuse and the kernel will still assign a different ++ * inode for internal use (called the "nodeid"). ++ * ++ * `fi` will always be NULL if the file is not currently open, but ++ * may also be NULL if the file is open. ++ */ ++ int (*getattr)(const char *, struct stat *, struct fuse_file_info *fi); ++ ++ /** ++ * Read the target of a symbolic link ++ * ++ * The buffer should be filled with a null terminated string. The ++ * buffer size argument includes the space for the terminating ++ * null character. If the linkname is too long to fit in the ++ * buffer, it should be truncated. The return value should be 0 ++ * for success. ++ */ ++ int (*readlink)(const char *, char *, size_t); ++ ++ /** ++ * Create a file node ++ * ++ * This is called for creation of all non-directory, non-symlink ++ * nodes. 
If the filesystem defines a create() method, then for ++ * regular files that will be called instead. ++ */ ++ int (*mknod)(const char *, mode_t, dev_t); ++ ++ /** ++ * Create a directory ++ * ++ * Note that the mode argument may not have the type specification ++ * bits set, i.e. S_ISDIR(mode) can be false. To obtain the ++ * correct directory type bits use mode|S_IFDIR ++ */ ++ int (*mkdir)(const char *, mode_t); ++ ++ /** Remove a file */ ++ int (*unlink)(const char *); ++ ++ /** Remove a directory */ ++ int (*rmdir)(const char *); ++ ++ /** Create a symbolic link */ ++ int (*symlink)(const char *, const char *); ++ ++ /** ++ * Rename a file ++ * ++ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If ++ * RENAME_NOREPLACE is specified, the filesystem must not ++ * overwrite *newname* if it exists and return an error ++ * instead. If `RENAME_EXCHANGE` is specified, the filesystem ++ * must atomically exchange the two files, i.e. both must ++ * exist and neither may be deleted. ++ */ ++ int (*rename)(const char *, const char *, unsigned int flags); ++ ++ /** Create a hard link to a file */ ++ int (*link)(const char *, const char *); ++ ++ /** ++ * Change the permission bits of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ */ ++ int (*chmod)(const char *, mode_t, struct fuse_file_info *fi); ++ ++ /** ++ * Change the owner and group of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*chown)(const char *, uid_t, gid_t, struct fuse_file_info *fi); ++ ++ /** ++ * Change the size of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. 
++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*truncate)(const char *, off_t, struct fuse_file_info *fi); ++ ++ /** ++ * Open a file ++ * ++ * Open flags are available in fi->flags. The following rules ++ * apply. ++ * ++ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be ++ * filtered out / handled by the kernel. ++ * ++ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR, O_EXEC, O_SEARCH) ++ * should be used by the filesystem to check if the operation is ++ * permitted. If the ``-o default_permissions`` mount option is ++ * given, this check is already done by the kernel before calling ++ * open() and may thus be omitted by the filesystem. ++ * ++ * - When writeback caching is enabled, the kernel may send ++ * read requests even for files opened with O_WRONLY. The ++ * filesystem should be prepared to handle this. ++ * ++ * - When writeback caching is disabled, the filesystem is ++ * expected to properly handle the O_APPEND flag and ensure ++ * that each write is appending to the end of the file. ++ * ++ * - When writeback caching is enabled, the kernel will ++ * handle O_APPEND. However, unless all changes to the file ++ * come through the kernel this will not work reliably. The ++ * filesystem should thus either ignore the O_APPEND flag ++ * (and let the kernel handle it), or return an error ++ * (indicating that reliably O_APPEND is not available). ++ * ++ * Filesystem may store an arbitrary file handle (pointer, ++ * index, etc) in fi->fh, and use this in other all other file ++ * operations (read, write, flush, release, fsync). ++ * ++ * Filesystem may also implement stateless file I/O and not store ++ * anything in fi->fh. ++ * ++ * There are also some flags (direct_io, keep_cache) which the ++ * filesystem may set in fi, to change the way the file is opened. ++ * See fuse_file_info structure in for more details. 
++ * ++ * If this request is answered with an error code of ENOSYS ++ * and FUSE_CAP_NO_OPEN_SUPPORT is set in ++ * `fuse_conn_info.capable`, this is treated as success and ++ * future calls to open will also succeed without being send ++ * to the filesystem process. ++ * ++ */ ++ int (*open)(const char *, struct fuse_file_info *); ++ ++ /** ++ * Read data from an open file ++ * ++ * Read should return exactly the number of bytes requested except ++ * on EOF or error, otherwise the rest of the data will be ++ * substituted with zeroes. An exception to this is when the ++ * 'direct_io' mount option is specified, in which case the return ++ * value of the read system call will reflect the return value of ++ * this operation. ++ */ ++ int (*read)(const char *, char *, size_t, off_t, struct fuse_file_info *); ++ ++ /** ++ * Write data to an open file ++ * ++ * Write should return exactly the number of bytes requested ++ * except on error. An exception to this is when the 'direct_io' ++ * mount option is specified (see read operation). ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*write)(const char *, const char *, size_t, off_t, ++ struct fuse_file_info *); ++ ++ /** ++ * Get file system statistics ++ * ++ * The 'f_favail', 'f_fsid' and 'f_flag' fields are ignored ++ */ ++ int (*statfs)(const char *, struct statvfs *); ++ ++ /** ++ * Possibly flush cached data ++ * ++ * BIG NOTE: This is not equivalent to fsync(). It's not a ++ * request to sync dirty data. ++ * ++ * Flush is called on each close() of a file descriptor, as opposed to ++ * release which is called on the close of the last file descriptor for ++ * a file. Under Linux, errors returned by flush() will be passed to ++ * userspace as errors from close(), so flush() is a good place to write ++ * back any cached dirty data. 
However, many applications ignore errors ++ * on close(), and on non-Linux systems, close() may succeed even if flush() ++ * returns an error. For these reasons, filesystems should not assume ++ * that errors returned by flush will ever be noticed or even ++ * delivered. ++ * ++ * NOTE: The flush() method may be called more than once for each ++ * open(). This happens if more than one file descriptor refers to an ++ * open file handle, e.g. due to dup(), dup2() or fork() calls. It is ++ * not possible to determine if a flush is final, so each flush should ++ * be treated equally. Multiple write-flush sequences are relatively ++ * rare, so this shouldn't be a problem. ++ * ++ * Filesystems shouldn't assume that flush will be called at any ++ * particular point. It may be called more times than expected, or not ++ * at all. ++ * ++ * [close]: ++ * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html ++ */ ++ int (*flush)(const char *, struct fuse_file_info *); ++ ++ /** ++ * Release an open file ++ * ++ * Release is called when there are no more references to an open ++ * file: all file descriptors are closed and all memory mappings ++ * are unmapped. ++ * ++ * For every open() call there will be exactly one release() call ++ * with the same flags and file handle. It is possible to ++ * have a file opened more than once, in which case only the last ++ * release will mean, that no more reads/writes will happen on the ++ * file. The return value of release is ignored. ++ */ ++ int (*release)(const char *, struct fuse_file_info *); ++ ++ /* ++ * Synchronize file contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data. 
++ */ ++ int (*fsync)(const char *, int, struct fuse_file_info *); ++ ++ /** Set extended attributes */ ++ int (*setxattr)(const char *, const char *, const char *, size_t, int); ++ ++ /** Get extended attributes */ ++ int (*getxattr)(const char *, const char *, char *, size_t); ++ ++ /** List extended attributes */ ++ int (*listxattr)(const char *, char *, size_t); ++ ++ /** Remove extended attributes */ ++ int (*removexattr)(const char *, const char *); ++ ++ /* ++ * Open directory ++ * ++ * Unless the 'default_permissions' mount option is given, ++ * this method should check if opendir is permitted for this ++ * directory. Optionally opendir may also return an arbitrary ++ * filehandle in the fuse_file_info structure, which will be ++ * passed to readdir, releasedir and fsyncdir. ++ */ ++ int (*opendir)(const char *, struct fuse_file_info *); ++ ++ /* ++ * Read directory ++ * ++ * The filesystem may choose between two modes of operation: ++ * ++ * 1) The readdir implementation ignores the offset parameter, and ++ * passes zero to the filler function's offset. The filler ++ * function will not return '1' (unless an error happens), so the ++ * whole directory is read in a single readdir operation. ++ * ++ * 2) The readdir implementation keeps track of the offsets of the ++ * directory entries. It uses the offset parameter and always ++ * passes non-zero offset to the filler function. When the buffer ++ * is full (or an error happens) the filler function will return ++ * '1'. 
++ */ ++ int (*readdir)(const char *, void *, fuse_fill_dir_t, off_t, ++ struct fuse_file_info *, enum fuse_readdir_flags); ++ ++ /** ++ * Release directory ++ */ ++ int (*releasedir)(const char *, struct fuse_file_info *); ++ ++ /** ++ * Synchronize directory contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data ++ */ ++ int (*fsyncdir)(const char *, int, struct fuse_file_info *); ++ ++ /** ++ * Initialize filesystem ++ * ++ * The return value will passed in the `private_data` field of ++ * `struct fuse_context` to all file operations, and as a ++ * parameter to the destroy() method. It overrides the initial ++ * value provided to fuse_main() / fuse_new(). ++ */ ++ void *(*init)(struct fuse_conn_info *conn, struct fuse_config *cfg); ++ ++ /** ++ * Clean up filesystem ++ * ++ * Called on filesystem exit. ++ */ ++ void (*destroy)(void *private_data); ++ ++ /** ++ * Check file access permissions ++ * ++ * This will be called for the access() system call. If the ++ * 'default_permissions' mount option is given, this method is not ++ * called. ++ * ++ * This method is not called under Linux kernel versions 2.4.x ++ */ ++ int (*access)(const char *, int); ++ ++ /** ++ * Create and open a file ++ * ++ * If the file does not exist, first create it with the specified ++ * mode, and then open it. ++ * ++ * If this method is not implemented or under Linux kernel ++ * versions earlier than 2.6.15, the mknod() and open() methods ++ * will be called instead. ++ */ ++ int (*create)(const char *, mode_t, struct fuse_file_info *); ++ ++ /** ++ * Perform POSIX file locking operation ++ * ++ * The cmd argument will be either F_GETLK, F_SETLK or F_SETLKW. ++ * ++ * For the meaning of fields in 'struct flock' see the man page ++ * for fcntl(2). The l_whence field will always be set to ++ * SEEK_SET. ++ * ++ * For checking lock ownership, the 'fuse_file_info->owner' ++ * argument must be used. 
++ * ++ * For F_GETLK operation, the library will first check currently ++ * held locks, and if a conflicting lock is found it will return ++ * information without calling this method. This ensures, that ++ * for local locks the l_pid field is correctly filled in. The ++ * results may not be accurate in case of race conditions and in ++ * the presence of hard links, but it's unlikely that an ++ * application would rely on accurate GETLK results in these ++ * cases. If a conflicting lock is not found, this method will be ++ * called, and the filesystem may fill out l_pid by a meaningful ++ * value, or it may leave this field zero. ++ * ++ * For F_SETLK and F_SETLKW the l_pid field will be set to the pid ++ * of the process performing the locking operation. ++ * ++ * Note: if this method is not implemented, the kernel will still ++ * allow file locking to work locally. Hence it is only ++ * interesting for network filesystems and similar. ++ */ ++ int (*lock)(const char *, struct fuse_file_info *, int cmd, struct flock *); ++ ++ /** ++ * Change the access and modification times of a file with ++ * nanosecond resolution ++ * ++ * This supersedes the old utime() interface. New applications ++ * should use this. ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * See the utimensat(2) man page for details. ++ */ ++ int (*utimens)(const char *, const struct timespec tv[2], ++ struct fuse_file_info *fi); ++ ++ /** ++ * Map block index within file to block index within device ++ * ++ * Note: This makes sense only for block device backed filesystems ++ * mounted with the 'blkdev' option ++ */ ++ int (*bmap)(const char *, size_t blocksize, uint64_t *idx); ++ ++ /** ++ * Ioctl ++ * ++ * flags will have FUSE_IOCTL_COMPAT set for 32bit ioctls in ++ * 64bit environment. The size and direction of data is ++ * determined by _IOC_*() decoding of cmd. 
For _IOC_NONE, ++ * data will be NULL, for _IOC_WRITE data is out area, for ++ * _IOC_READ in area and if both are set in/out area. In all ++ * non-NULL cases, the area is of _IOC_SIZE(cmd) bytes. ++ * ++ * If flags has FUSE_IOCTL_DIR then the fuse_file_info refers to a ++ * directory file handle. ++ * ++ * Note : the unsigned long request submitted by the application ++ * is truncated to 32 bits. ++ */ ++ int (*ioctl)(const char *, unsigned int cmd, void *arg, ++ struct fuse_file_info *, unsigned int flags, void *data); ++ ++ /** ++ * Poll for IO readiness events ++ * ++ * Note: If ph is non-NULL, the client should notify ++ * when IO readiness events occur by calling ++ * fuse_notify_poll() with the specified ph. ++ * ++ * Regardless of the number of times poll with a non-NULL ph ++ * is received, single notification is enough to clear all. ++ * Notifying more times incurs overhead but doesn't harm ++ * correctness. ++ * ++ * The callee is responsible for destroying ph with ++ * fuse_pollhandle_destroy() when no longer in use. ++ */ ++ int (*poll)(const char *, struct fuse_file_info *, ++ struct fuse_pollhandle *ph, unsigned *reventsp); ++ ++ /* ++ * Write contents of buffer to an open file ++ * ++ * Similar to the write() method, but data is supplied in a ++ * generic buffer. Use fuse_buf_copy() to transfer data to ++ * the destination. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*write_buf)(const char *, struct fuse_bufvec *buf, off_t off, ++ struct fuse_file_info *); ++ ++ /* ++ * Store data from an open file in a buffer ++ * ++ * Similar to the read() method, but data is stored and ++ * returned in a generic buffer. ++ * ++ * No actual copying of data has to take place, the source ++ * file descriptor may simply be stored in the buffer for ++ * later data transfer. ++ * ++ * The buffer must be allocated dynamically and stored at the ++ * location pointed to by bufp. 
If the buffer contains memory ++ * regions, they too must be allocated using malloc(). The ++ * allocated memory will be freed by the caller. ++ */ ++ int (*read_buf)(const char *, struct fuse_bufvec **bufp, size_t size, ++ off_t off, struct fuse_file_info *); ++ /** ++ * Perform BSD file locking operation ++ * ++ * The op argument will be either LOCK_SH, LOCK_EX or LOCK_UN ++ * ++ * Nonblocking requests will be indicated by ORing LOCK_NB to ++ * the above operations ++ * ++ * For more information see the flock(2) manual page. ++ * ++ * Additionally fi->owner will be set to a value unique to ++ * this open file. This same value will be supplied to ++ * ->release() when the file is released. ++ * ++ * Note: if this method is not implemented, the kernel will still ++ * allow file locking to work locally. Hence it is only ++ * interesting for network filesystems and similar. ++ */ ++ int (*flock)(const char *, struct fuse_file_info *, int op); ++ ++ /** ++ * Allocates space for an open file ++ * ++ * This function ensures that required space is allocated for specified ++ * file. If this function returns success then any subsequent write ++ * request to specified range is guaranteed not to fail because of lack ++ * of space on the file system media. ++ */ ++ int (*fallocate)(const char *, int, off_t, off_t, struct fuse_file_info *); ++ ++ /** ++ * Copy a range of data from one file to another ++ * ++ * Performs an optimized copy between two file descriptors without the ++ * additional cost of transferring data through the FUSE kernel module ++ * to user space (glibc) and then back into the FUSE filesystem again. ++ * ++ * In case this method is not implemented, glibc falls back to reading ++ * data from the source and writing to the destination. Effectively ++ * doing an inefficient copy of the data. 
++ */ ++ ssize_t (*copy_file_range)(const char *path_in, ++ struct fuse_file_info *fi_in, off_t offset_in, ++ const char *path_out, ++ struct fuse_file_info *fi_out, off_t offset_out, ++ size_t size, int flags); ++ ++ /** ++ * Find next data or hole after the specified offset ++ */ ++ off_t (*lseek)(const char *, off_t off, int whence, ++ struct fuse_file_info *); + }; + +-/** Extra context that may be needed by some filesystems ++/* ++ * Extra context that may be needed by some filesystems + * + * The uid, gid and pid fields are not filled in case of a writepage + * operation. + */ + struct fuse_context { +- /** Pointer to the fuse object */ +- struct fuse *fuse; ++ /** Pointer to the fuse object */ ++ struct fuse *fuse; + +- /** User ID of the calling process */ +- uid_t uid; ++ /** User ID of the calling process */ ++ uid_t uid; + +- /** Group ID of the calling process */ +- gid_t gid; ++ /** Group ID of the calling process */ ++ gid_t gid; + +- /** Process ID of the calling thread */ +- pid_t pid; ++ /** Process ID of the calling thread */ ++ pid_t pid; + +- /** Private filesystem data */ +- void *private_data; ++ /** Private filesystem data */ ++ void *private_data; + +- /** Umask of the calling process */ +- mode_t umask; ++ /** Umask of the calling process */ ++ mode_t umask; + }; + + /** +@@ -859,15 +880,15 @@ struct fuse_context { + * Example usage, see hello.c + */ + /* +- int fuse_main(int argc, char *argv[], const struct fuse_operations *op, +- void *private_data); +-*/ +-#define fuse_main(argc, argv, op, private_data) \ +- fuse_main_real(argc, argv, op, sizeof(*(op)), private_data) ++ * int fuse_main(int argc, char *argv[], const struct fuse_operations *op, ++ * void *private_data); ++ */ ++#define fuse_main(argc, argv, op, private_data) \ ++ fuse_main_real(argc, argv, op, sizeof(*(op)), private_data) + +-/* ----------------------------------------------------------- * +- * More detailed API * +- * 
----------------------------------------------------------- */ ++/* ++ * More detailed API ++ */ + + /** + * Print available options (high- and low-level) to stdout. This is +@@ -910,12 +931,13 @@ void fuse_lib_help(struct fuse_args *args); + * @return the created FUSE handle + */ + #if FUSE_USE_VERSION == 30 +-struct fuse *fuse_new_30(struct fuse_args *args, const struct fuse_operations *op, +- size_t op_size, void *private_data); ++struct fuse *fuse_new_30(struct fuse_args *args, ++ const struct fuse_operations *op, size_t op_size, ++ void *private_data); + #define fuse_new(args, op, size, data) fuse_new_30(args, op, size, data) + #else + struct fuse *fuse_new(struct fuse_args *args, const struct fuse_operations *op, +- size_t op_size, void *private_data); ++ size_t op_size, void *private_data); + #endif + + /** +@@ -940,7 +962,7 @@ void fuse_unmount(struct fuse *f); + /** + * Destroy the FUSE handle. + * +- * NOTE: This function does not unmount the filesystem. If this is ++ * NOTE: This function does not unmount the filesystem. If this is + * needed, call fuse_unmount() before calling this function. + * + * @param f the FUSE handle +@@ -1030,7 +1052,7 @@ int fuse_invalidate_path(struct fuse *f, const char *path); + * Do not call this directly, use fuse_main() + */ + int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op, +- size_t op_size, void *private_data); ++ size_t op_size, void *private_data); + + /** + * Start the cleanup thread when using option "remember". 
+@@ -1081,89 +1103,87 @@ struct fuse_fs; + */ + + int fuse_fs_getattr(struct fuse_fs *fs, const char *path, struct stat *buf, +- struct fuse_file_info *fi); +-int fuse_fs_rename(struct fuse_fs *fs, const char *oldpath, +- const char *newpath, unsigned int flags); ++ struct fuse_file_info *fi); ++int fuse_fs_rename(struct fuse_fs *fs, const char *oldpath, const char *newpath, ++ unsigned int flags); + int fuse_fs_unlink(struct fuse_fs *fs, const char *path); + int fuse_fs_rmdir(struct fuse_fs *fs, const char *path); +-int fuse_fs_symlink(struct fuse_fs *fs, const char *linkname, +- const char *path); ++int fuse_fs_symlink(struct fuse_fs *fs, const char *linkname, const char *path); + int fuse_fs_link(struct fuse_fs *fs, const char *oldpath, const char *newpath); +-int fuse_fs_release(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++int fuse_fs_release(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); + int fuse_fs_open(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_read(struct fuse_fs *fs, const char *path, char *buf, size_t size, +- off_t off, struct fuse_file_info *fi); ++ off_t off, struct fuse_file_info *fi); + int fuse_fs_read_buf(struct fuse_fs *fs, const char *path, +- struct fuse_bufvec **bufp, size_t size, off_t off, +- struct fuse_file_info *fi); ++ struct fuse_bufvec **bufp, size_t size, off_t off, ++ struct fuse_file_info *fi); + int fuse_fs_write(struct fuse_fs *fs, const char *path, const char *buf, +- size_t size, off_t off, struct fuse_file_info *fi); ++ size_t size, off_t off, struct fuse_file_info *fi); + int fuse_fs_write_buf(struct fuse_fs *fs, const char *path, +- struct fuse_bufvec *buf, off_t off, +- struct fuse_file_info *fi); ++ struct fuse_bufvec *buf, off_t off, ++ struct fuse_file_info *fi); + int fuse_fs_fsync(struct fuse_fs *fs, const char *path, int datasync, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int 
fuse_fs_flush(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_statfs(struct fuse_fs *fs, const char *path, struct statvfs *buf); + int fuse_fs_opendir(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_readdir(struct fuse_fs *fs, const char *path, void *buf, +- fuse_fill_dir_t filler, off_t off, +- struct fuse_file_info *fi, enum fuse_readdir_flags flags); ++ fuse_fill_dir_t filler, off_t off, ++ struct fuse_file_info *fi, enum fuse_readdir_flags flags); + int fuse_fs_fsyncdir(struct fuse_fs *fs, const char *path, int datasync, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_releasedir(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_create(struct fuse_fs *fs, const char *path, mode_t mode, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_lock(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi, int cmd, struct flock *lock); ++ struct fuse_file_info *fi, int cmd, struct flock *lock); + int fuse_fs_flock(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi, int op); ++ struct fuse_file_info *fi, int op); + int fuse_fs_chmod(struct fuse_fs *fs, const char *path, mode_t mode, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_chown(struct fuse_fs *fs, const char *path, uid_t uid, gid_t gid, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_truncate(struct fuse_fs *fs, const char *path, off_t size, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_utimens(struct fuse_fs *fs, const char *path, +- const struct timespec tv[2], struct fuse_file_info *fi); ++ const struct timespec tv[2], struct fuse_file_info *fi); + int fuse_fs_access(struct fuse_fs *fs, const char *path, int mask); + int fuse_fs_readlink(struct fuse_fs *fs, 
const char *path, char *buf, +- size_t len); ++ size_t len); + int fuse_fs_mknod(struct fuse_fs *fs, const char *path, mode_t mode, +- dev_t rdev); ++ dev_t rdev); + int fuse_fs_mkdir(struct fuse_fs *fs, const char *path, mode_t mode); + int fuse_fs_setxattr(struct fuse_fs *fs, const char *path, const char *name, +- const char *value, size_t size, int flags); ++ const char *value, size_t size, int flags); + int fuse_fs_getxattr(struct fuse_fs *fs, const char *path, const char *name, +- char *value, size_t size); ++ char *value, size_t size); + int fuse_fs_listxattr(struct fuse_fs *fs, const char *path, char *list, +- size_t size); +-int fuse_fs_removexattr(struct fuse_fs *fs, const char *path, +- const char *name); ++ size_t size); ++int fuse_fs_removexattr(struct fuse_fs *fs, const char *path, const char *name); + int fuse_fs_bmap(struct fuse_fs *fs, const char *path, size_t blocksize, +- uint64_t *idx); ++ uint64_t *idx); + int fuse_fs_ioctl(struct fuse_fs *fs, const char *path, unsigned int cmd, +- void *arg, struct fuse_file_info *fi, unsigned int flags, +- void *data); ++ void *arg, struct fuse_file_info *fi, unsigned int flags, ++ void *data); + int fuse_fs_poll(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi, struct fuse_pollhandle *ph, +- unsigned *reventsp); ++ struct fuse_file_info *fi, struct fuse_pollhandle *ph, ++ unsigned *reventsp); + int fuse_fs_fallocate(struct fuse_fs *fs, const char *path, int mode, +- off_t offset, off_t length, struct fuse_file_info *fi); ++ off_t offset, off_t length, struct fuse_file_info *fi); + ssize_t fuse_fs_copy_file_range(struct fuse_fs *fs, const char *path_in, +- struct fuse_file_info *fi_in, off_t off_in, +- const char *path_out, +- struct fuse_file_info *fi_out, off_t off_out, +- size_t len, int flags); ++ struct fuse_file_info *fi_in, off_t off_in, ++ const char *path_out, ++ struct fuse_file_info *fi_out, off_t off_out, ++ size_t len, int flags); + off_t fuse_fs_lseek(struct fuse_fs *fs, const 
char *path, off_t off, int whence, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + void fuse_fs_init(struct fuse_fs *fs, struct fuse_conn_info *conn, +- struct fuse_config *cfg); ++ struct fuse_config *cfg); + void fuse_fs_destroy(struct fuse_fs *fs); + + int fuse_notify_poll(struct fuse_pollhandle *ph); +@@ -1182,7 +1202,7 @@ int fuse_notify_poll(struct fuse_pollhandle *ph); + * @return a new filesystem object + */ + struct fuse_fs *fuse_fs_new(const struct fuse_operations *op, size_t op_size, +- void *private_data); ++ void *private_data); + + /** + * Factory for creating filesystem objects +@@ -1199,7 +1219,7 @@ struct fuse_fs *fuse_fs_new(const struct fuse_operations *op, size_t op_size, + * @return the new filesystem object + */ + typedef struct fuse_fs *(*fuse_module_factory_t)(struct fuse_args *args, +- struct fuse_fs *fs[]); ++ struct fuse_fs *fs[]); + /** + * Register filesystem module + * +@@ -1211,7 +1231,7 @@ typedef struct fuse_fs *(*fuse_module_factory_t)(struct fuse_args *args, + * @param factory_ the factory function for this filesystem module + */ + #define FUSE_REGISTER_MODULE(name_, factory_) \ +- fuse_module_factory_t fuse_module_ ## name_ ## _factory = factory_ ++ fuse_module_factory_t fuse_module_##name_##_factory = factory_ + + /** Get session from fuse object */ + struct fuse_session *fuse_get_session(struct fuse *f); +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index bf8f8cc..bd9bf86 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -1,21 +1,23 @@ +-/* FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++/* ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. 
++ */ + + /** @file */ + + #if !defined(FUSE_H_) && !defined(FUSE_LOWLEVEL_H_) +-#error "Never include directly; use or instead." ++#error \ ++ "Never include directly; use or instead." + #endif + + #ifndef FUSE_COMMON_H_ + #define FUSE_COMMON_H_ + +-#include "fuse_opt.h" + #include "fuse_log.h" ++#include "fuse_opt.h" + #include + #include + +@@ -25,7 +27,7 @@ + /** Minor version of FUSE library interface */ + #define FUSE_MINOR_VERSION 2 + +-#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) ++#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) + #define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION) + + /** +@@ -38,67 +40,83 @@ + * descriptors can share a single file handle. + */ + struct fuse_file_info { +- /** Open flags. Available in open() and release() */ +- int flags; +- +- /** In case of a write operation indicates if this was caused +- by a delayed write from the page cache. If so, then the +- context's pid, uid, and gid fields will not be valid, and +- the *fh* value may not match the *fh* value that would +- have been sent with the corresponding individual write +- requests if write caching had been disabled. */ +- unsigned int writepage : 1; +- +- /** Can be filled in by open, to use direct I/O on this file. */ +- unsigned int direct_io : 1; +- +- /** Can be filled in by open. It signals the kernel that any +- currently cached file data (ie., data that the filesystem +- provided the last time the file was open) need not be +- invalidated. Has no effect when set in other contexts (in +- particular it does nothing when set by opendir()). */ +- unsigned int keep_cache : 1; +- +- /** Indicates a flush operation. Set in flush operation, also +- maybe set in highlevel lock operation and lowlevel release +- operation. */ +- unsigned int flush : 1; +- +- /** Can be filled in by open, to indicate that the file is not +- seekable. 
*/ +- unsigned int nonseekable : 1; +- +- /* Indicates that flock locks for this file should be +- released. If set, lock_owner shall contain a valid value. +- May only be set in ->release(). */ +- unsigned int flock_release : 1; +- +- /** Can be filled in by opendir. It signals the kernel to +- enable caching of entries returned by readdir(). Has no +- effect when set in other contexts (in particular it does +- nothing when set by open()). */ +- unsigned int cache_readdir : 1; +- +- /** Padding. Reserved for future use*/ +- unsigned int padding : 25; +- unsigned int padding2 : 32; +- +- /** File handle id. May be filled in by filesystem in create, +- * open, and opendir(). Available in most other file operations on the +- * same file handle. */ +- uint64_t fh; +- +- /** Lock owner id. Available in locking operations and flush */ +- uint64_t lock_owner; +- +- /** Requested poll events. Available in ->poll. Only set on kernels +- which support it. If unsupported, this field is set to zero. */ +- uint32_t poll_events; ++ /** Open flags. Available in open() and release() */ ++ int flags; ++ ++ /* ++ * In case of a write operation indicates if this was caused ++ * by a delayed write from the page cache. If so, then the ++ * context's pid, uid, and gid fields will not be valid, and ++ * the *fh* value may not match the *fh* value that would ++ * have been sent with the corresponding individual write ++ * requests if write caching had been disabled. ++ */ ++ unsigned int writepage:1; ++ ++ /** Can be filled in by open, to use direct I/O on this file. */ ++ unsigned int direct_io:1; ++ ++ /* ++ * Can be filled in by open. It signals the kernel that any ++ * currently cached file data (ie., data that the filesystem ++ * provided the last time the file was open) need not be ++ * invalidated. Has no effect when set in other contexts (in ++ * particular it does nothing when set by opendir()). ++ */ ++ unsigned int keep_cache:1; ++ ++ /* ++ * Indicates a flush operation. 
Set in flush operation, also ++ * maybe set in highlevel lock operation and lowlevel release ++ * operation. ++ */ ++ unsigned int flush:1; ++ ++ /* ++ * Can be filled in by open, to indicate that the file is not ++ * seekable. ++ */ ++ unsigned int nonseekable:1; ++ ++ /* ++ * Indicates that flock locks for this file should be ++ * released. If set, lock_owner shall contain a valid value. ++ * May only be set in ->release(). ++ */ ++ unsigned int flock_release:1; ++ ++ /* ++ * Can be filled in by opendir. It signals the kernel to ++ * enable caching of entries returned by readdir(). Has no ++ * effect when set in other contexts (in particular it does ++ * nothing when set by open()). ++ */ ++ unsigned int cache_readdir:1; ++ ++ /** Padding. Reserved for future use*/ ++ unsigned int padding:25; ++ unsigned int padding2:32; ++ ++ /* ++ * File handle id. May be filled in by filesystem in create, ++ * open, and opendir(). Available in most other file operations on the ++ * same file handle. ++ */ ++ uint64_t fh; ++ ++ /** Lock owner id. Available in locking operations and flush */ ++ uint64_t lock_owner; ++ ++ /* ++ * Requested poll events. Available in ->poll. Only set on kernels ++ * which support it. If unsupported, this field is set to zero. ++ */ ++ uint32_t poll_events; + }; + +-/************************************************************************** +- * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' * +- **************************************************************************/ ++/* ++ * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' ++ */ + + /** + * Indicates that the filesystem supports asynchronous read requests. +@@ -110,7 +128,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_ASYNC_READ (1 << 0) ++#define FUSE_CAP_ASYNC_READ (1 << 0) + + /** + * Indicates that the filesystem supports "remote" locking. 
+@@ -118,7 +136,7 @@ struct fuse_file_info { + * This feature is enabled by default when supported by the kernel, + * and if getlk() and setlk() handlers are implemented. + */ +-#define FUSE_CAP_POSIX_LOCKS (1 << 1) ++#define FUSE_CAP_POSIX_LOCKS (1 << 1) + + /** + * Indicates that the filesystem supports the O_TRUNC open flag. If +@@ -127,14 +145,14 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3) ++#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3) + + /** + * Indicates that the filesystem supports lookups of "." and "..". + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_EXPORT_SUPPORT (1 << 4) ++#define FUSE_CAP_EXPORT_SUPPORT (1 << 4) + + /** + * Indicates that the kernel should not apply the umask to the +@@ -142,7 +160,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_DONT_MASK (1 << 6) ++#define FUSE_CAP_DONT_MASK (1 << 6) + + /** + * Indicates that libfuse should try to use splice() when writing to +@@ -150,7 +168,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_SPLICE_WRITE (1 << 7) ++#define FUSE_CAP_SPLICE_WRITE (1 << 7) + + /** + * Indicates that libfuse should try to move pages instead of copying when +@@ -158,7 +176,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_SPLICE_MOVE (1 << 8) ++#define FUSE_CAP_SPLICE_MOVE (1 << 8) + + /** + * Indicates that libfuse should try to use splice() when reading from +@@ -167,7 +185,7 @@ struct fuse_file_info { + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements a write_buf() handler. 
+ */ +-#define FUSE_CAP_SPLICE_READ (1 << 9) ++#define FUSE_CAP_SPLICE_READ (1 << 9) + + /** + * If set, the calls to flock(2) will be emulated using POSIX locks and must +@@ -180,14 +198,14 @@ struct fuse_file_info { + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements a flock() handler. + */ +-#define FUSE_CAP_FLOCK_LOCKS (1 << 10) ++#define FUSE_CAP_FLOCK_LOCKS (1 << 10) + + /** + * Indicates that the filesystem supports ioctl's on directories. + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_IOCTL_DIR (1 << 11) ++#define FUSE_CAP_IOCTL_DIR (1 << 11) + + /** + * Traditionally, while a file is open the FUSE kernel module only +@@ -209,7 +227,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12) ++#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12) + + /** + * Indicates that the filesystem supports readdirplus. +@@ -217,7 +235,7 @@ struct fuse_file_info { + * This feature is enabled by default when supported by the kernel and if the + * filesystem implements a readdirplus() handler. + */ +-#define FUSE_CAP_READDIRPLUS (1 << 13) ++#define FUSE_CAP_READDIRPLUS (1 << 13) + + /** + * Indicates that the filesystem supports adaptive readdirplus. +@@ -245,7 +263,7 @@ struct fuse_file_info { + * if the filesystem implements both a readdirplus() and a readdir() + * handler. + */ +-#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14) ++#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14) + + /** + * Indicates that the filesystem supports asynchronous direct I/O submission. +@@ -256,7 +274,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_ASYNC_DIO (1 << 15) ++#define FUSE_CAP_ASYNC_DIO (1 << 15) + + /** + * Indicates that writeback caching should be enabled. 
This means that +@@ -265,7 +283,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_WRITEBACK_CACHE (1 << 16) ++#define FUSE_CAP_WRITEBACK_CACHE (1 << 16) + + /** + * Indicates support for zero-message opens. If this flag is set in +@@ -278,7 +296,7 @@ struct fuse_file_info { + * Setting (or unsetting) this flag in the `want` field has *no + * effect*. + */ +-#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17) ++#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17) + + /** + * Indicates support for parallel directory operations. If this flag +@@ -288,7 +306,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_PARALLEL_DIROPS (1 << 18) ++#define FUSE_CAP_PARALLEL_DIROPS (1 << 18) + + /** + * Indicates support for POSIX ACLs. +@@ -307,7 +325,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_POSIX_ACL (1 << 19) ++#define FUSE_CAP_POSIX_ACL (1 << 19) + + /** + * Indicates that the filesystem is responsible for unsetting +@@ -316,7 +334,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20) ++#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20) + + /** + * Indicates support for zero-message opendirs. If this flag is set in +@@ -328,7 +346,7 @@ struct fuse_file_info { + * + * Setting (or unsetting) this flag in the `want` field has *no effect*. 
+ */ +-#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24) ++#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24) + + /** + * Ioctl flags +@@ -340,12 +358,12 @@ struct fuse_file_info { + * + * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs + */ +-#define FUSE_IOCTL_COMPAT (1 << 0) +-#define FUSE_IOCTL_UNRESTRICTED (1 << 1) +-#define FUSE_IOCTL_RETRY (1 << 2) +-#define FUSE_IOCTL_DIR (1 << 4) ++#define FUSE_IOCTL_COMPAT (1 << 0) ++#define FUSE_IOCTL_UNRESTRICTED (1 << 1) ++#define FUSE_IOCTL_RETRY (1 << 2) ++#define FUSE_IOCTL_DIR (1 << 4) + +-#define FUSE_IOCTL_MAX_IOV 256 ++#define FUSE_IOCTL_MAX_IOV 256 + + /** + * Connection information, passed to the ->init() method +@@ -355,114 +373,114 @@ struct fuse_file_info { + * value must usually be smaller than the indicated value. + */ + struct fuse_conn_info { +- /** +- * Major version of the protocol (read-only) +- */ +- unsigned proto_major; +- +- /** +- * Minor version of the protocol (read-only) +- */ +- unsigned proto_minor; +- +- /** +- * Maximum size of the write buffer +- */ +- unsigned max_write; +- +- /** +- * Maximum size of read requests. A value of zero indicates no +- * limit. However, even if the filesystem does not specify a +- * limit, the maximum size of read requests will still be +- * limited by the kernel. +- * +- * NOTE: For the time being, the maximum size of read requests +- * must be set both here *and* passed to fuse_session_new() +- * using the ``-o max_read=`` mount option. At some point +- * in the future, specifying the mount option will no longer +- * be necessary. +- */ +- unsigned max_read; +- +- /** +- * Maximum readahead +- */ +- unsigned max_readahead; +- +- /** +- * Capability flags that the kernel supports (read-only) +- */ +- unsigned capable; +- +- /** +- * Capability flags that the filesystem wants to enable. +- * +- * libfuse attempts to initialize this field with +- * reasonable default values before calling the init() handler. 
+- */ +- unsigned want; +- +- /** +- * Maximum number of pending "background" requests. A +- * background request is any type of request for which the +- * total number is not limited by other means. As of kernel +- * 4.8, only two types of requests fall into this category: +- * +- * 1. Read-ahead requests +- * 2. Asynchronous direct I/O requests +- * +- * Read-ahead requests are generated (if max_readahead is +- * non-zero) by the kernel to preemptively fill its caches +- * when it anticipates that userspace will soon read more +- * data. +- * +- * Asynchronous direct I/O requests are generated if +- * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large +- * direct I/O request. In this case the kernel will internally +- * split it up into multiple smaller requests and submit them +- * to the filesystem concurrently. +- * +- * Note that the following requests are *not* background +- * requests: writeback requests (limited by the kernel's +- * flusher algorithm), regular (i.e., synchronous and +- * buffered) userspace read/write requests (limited to one per +- * thread), asynchronous read requests (Linux's io_submit(2) +- * call actually blocks, so these are also limited to one per +- * thread). +- */ +- unsigned max_background; +- +- /** +- * Kernel congestion threshold parameter. If the number of pending +- * background requests exceeds this number, the FUSE kernel module will +- * mark the filesystem as "congested". This instructs the kernel to +- * expect that queued requests will take some time to complete, and to +- * adjust its algorithms accordingly (e.g. by putting a waiting thread +- * to sleep instead of using a busy-loop). +- */ +- unsigned congestion_threshold; +- +- /** +- * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible +- * for updating mtime and ctime when write requests are received. The +- * updated values are passed to the filesystem with setattr() requests. 
+- * However, if the filesystem does not support the full resolution of +- * the kernel timestamps (nanoseconds), the mtime and ctime values used +- * by kernel and filesystem will differ (and result in an apparent +- * change of times after a cache flush). +- * +- * To prevent this problem, this variable can be used to inform the +- * kernel about the timestamp granularity supported by the file-system. +- * The value should be power of 10. The default is 1, i.e. full +- * nano-second resolution. Filesystems supporting only second resolution +- * should set this to 1000000000. +- */ +- unsigned time_gran; +- +- /** +- * For future use. +- */ +- unsigned reserved[22]; ++ /** ++ * Major version of the protocol (read-only) ++ */ ++ unsigned proto_major; ++ ++ /** ++ * Minor version of the protocol (read-only) ++ */ ++ unsigned proto_minor; ++ ++ /** ++ * Maximum size of the write buffer ++ */ ++ unsigned max_write; ++ ++ /** ++ * Maximum size of read requests. A value of zero indicates no ++ * limit. However, even if the filesystem does not specify a ++ * limit, the maximum size of read requests will still be ++ * limited by the kernel. ++ * ++ * NOTE: For the time being, the maximum size of read requests ++ * must be set both here *and* passed to fuse_session_new() ++ * using the ``-o max_read=`` mount option. At some point ++ * in the future, specifying the mount option will no longer ++ * be necessary. ++ */ ++ unsigned max_read; ++ ++ /** ++ * Maximum readahead ++ */ ++ unsigned max_readahead; ++ ++ /** ++ * Capability flags that the kernel supports (read-only) ++ */ ++ unsigned capable; ++ ++ /** ++ * Capability flags that the filesystem wants to enable. ++ * ++ * libfuse attempts to initialize this field with ++ * reasonable default values before calling the init() handler. ++ */ ++ unsigned want; ++ ++ /** ++ * Maximum number of pending "background" requests. 
A ++ * background request is any type of request for which the ++ * total number is not limited by other means. As of kernel ++ * 4.8, only two types of requests fall into this category: ++ * ++ * 1. Read-ahead requests ++ * 2. Asynchronous direct I/O requests ++ * ++ * Read-ahead requests are generated (if max_readahead is ++ * non-zero) by the kernel to preemptively fill its caches ++ * when it anticipates that userspace will soon read more ++ * data. ++ * ++ * Asynchronous direct I/O requests are generated if ++ * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large ++ * direct I/O request. In this case the kernel will internally ++ * split it up into multiple smaller requests and submit them ++ * to the filesystem concurrently. ++ * ++ * Note that the following requests are *not* background ++ * requests: writeback requests (limited by the kernel's ++ * flusher algorithm), regular (i.e., synchronous and ++ * buffered) userspace read/write requests (limited to one per ++ * thread), asynchronous read requests (Linux's io_submit(2) ++ * call actually blocks, so these are also limited to one per ++ * thread). ++ */ ++ unsigned max_background; ++ ++ /** ++ * Kernel congestion threshold parameter. If the number of pending ++ * background requests exceeds this number, the FUSE kernel module will ++ * mark the filesystem as "congested". This instructs the kernel to ++ * expect that queued requests will take some time to complete, and to ++ * adjust its algorithms accordingly (e.g. by putting a waiting thread ++ * to sleep instead of using a busy-loop). ++ */ ++ unsigned congestion_threshold; ++ ++ /** ++ * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible ++ * for updating mtime and ctime when write requests are received. The ++ * updated values are passed to the filesystem with setattr() requests. 
++ * However, if the filesystem does not support the full resolution of ++ * the kernel timestamps (nanoseconds), the mtime and ctime values used ++ * by kernel and filesystem will differ (and result in an apparent ++ * change of times after a cache flush). ++ * ++ * To prevent this problem, this variable can be used to inform the ++ * kernel about the timestamp granularity supported by the file-system. ++ * The value should be power of 10. The default is 1, i.e. full ++ * nano-second resolution. Filesystems supporting only second resolution ++ * should set this to 1000000000. ++ */ ++ unsigned time_gran; ++ ++ /** ++ * For future use. ++ */ ++ unsigned reserved[22]; + }; + + struct fuse_session; +@@ -489,21 +507,20 @@ struct fuse_conn_info_opts; + * -o async_read sets FUSE_CAP_ASYNC_READ in conn->want + * -o sync_read unsets FUSE_CAP_ASYNC_READ in conn->want + * -o atomic_o_trunc sets FUSE_CAP_ATOMIC_O_TRUNC in conn->want +- * -o no_remote_lock Equivalent to -o no_remote_flock,no_remote_posix_lock +- * -o no_remote_flock Unsets FUSE_CAP_FLOCK_LOCKS in conn->want +- * -o no_remote_posix_lock Unsets FUSE_CAP_POSIX_LOCKS in conn->want +- * -o [no_]splice_write (un-)sets FUSE_CAP_SPLICE_WRITE in conn->want +- * -o [no_]splice_move (un-)sets FUSE_CAP_SPLICE_MOVE in conn->want +- * -o [no_]splice_read (un-)sets FUSE_CAP_SPLICE_READ in conn->want +- * -o [no_]auto_inval_data (un-)sets FUSE_CAP_AUTO_INVAL_DATA in conn->want +- * -o readdirplus=no unsets FUSE_CAP_READDIRPLUS in conn->want +- * -o readdirplus=yes sets FUSE_CAP_READDIRPLUS and unsets +- * FUSE_CAP_READDIRPLUS_AUTO in conn->want +- * -o readdirplus=auto sets FUSE_CAP_READDIRPLUS and +- * FUSE_CAP_READDIRPLUS_AUTO in conn->want +- * -o [no_]async_dio (un-)sets FUSE_CAP_ASYNC_DIO in conn->want +- * -o [no_]writeback_cache (un-)sets FUSE_CAP_WRITEBACK_CACHE in conn->want +- * -o time_gran=N sets conn->time_gran ++ * -o no_remote_lock Equivalent to -o ++ *no_remote_flock,no_remote_posix_lock -o no_remote_flock 
Unsets ++ *FUSE_CAP_FLOCK_LOCKS in conn->want -o no_remote_posix_lock Unsets ++ *FUSE_CAP_POSIX_LOCKS in conn->want -o [no_]splice_write (un-)sets ++ *FUSE_CAP_SPLICE_WRITE in conn->want -o [no_]splice_move (un-)sets ++ *FUSE_CAP_SPLICE_MOVE in conn->want -o [no_]splice_read (un-)sets ++ *FUSE_CAP_SPLICE_READ in conn->want -o [no_]auto_inval_data (un-)sets ++ *FUSE_CAP_AUTO_INVAL_DATA in conn->want -o readdirplus=no unsets ++ *FUSE_CAP_READDIRPLUS in conn->want -o readdirplus=yes sets ++ *FUSE_CAP_READDIRPLUS and unsets FUSE_CAP_READDIRPLUS_AUTO in conn->want -o ++ *readdirplus=auto sets FUSE_CAP_READDIRPLUS and FUSE_CAP_READDIRPLUS_AUTO ++ *in conn->want -o [no_]async_dio (un-)sets FUSE_CAP_ASYNC_DIO in ++ *conn->want -o [no_]writeback_cache (un-)sets FUSE_CAP_WRITEBACK_CACHE in ++ *conn->want -o time_gran=N sets conn->time_gran + * + * Known options will be removed from *args*, unknown options will be + * passed through unchanged. +@@ -511,7 +528,7 @@ struct fuse_conn_info_opts; + * @param args argument vector (input+output) + * @return parsed options + **/ +-struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args); ++struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args); + + /** + * This function applies the (parsed) parameters in *opts* to the +@@ -521,7 +538,7 @@ struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args); + * option has been explicitly set. 
+ */ + void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, +- struct fuse_conn_info *conn); ++ struct fuse_conn_info *conn); + + /** + * Go into the background +@@ -552,81 +569,81 @@ const char *fuse_pkgversion(void); + */ + void fuse_pollhandle_destroy(struct fuse_pollhandle *ph); + +-/* ----------------------------------------------------------- * +- * Data buffer * +- * ----------------------------------------------------------- */ ++/* ++ * Data buffer ++ */ + + /** + * Buffer flags + */ + enum fuse_buf_flags { +- /** +- * Buffer contains a file descriptor +- * +- * If this flag is set, the .fd field is valid, otherwise the +- * .mem fields is valid. +- */ +- FUSE_BUF_IS_FD = (1 << 1), +- +- /** +- * Seek on the file descriptor +- * +- * If this flag is set then the .pos field is valid and is +- * used to seek to the given offset before performing +- * operation on file descriptor. +- */ +- FUSE_BUF_FD_SEEK = (1 << 2), +- +- /** +- * Retry operation on file descriptor +- * +- * If this flag is set then retry operation on file descriptor +- * until .size bytes have been copied or an error or EOF is +- * detected. +- */ +- FUSE_BUF_FD_RETRY = (1 << 3), ++ /** ++ * Buffer contains a file descriptor ++ * ++ * If this flag is set, the .fd field is valid, otherwise the ++ * .mem fields is valid. ++ */ ++ FUSE_BUF_IS_FD = (1 << 1), ++ ++ /** ++ * Seek on the file descriptor ++ * ++ * If this flag is set then the .pos field is valid and is ++ * used to seek to the given offset before performing ++ * operation on file descriptor. ++ */ ++ FUSE_BUF_FD_SEEK = (1 << 2), ++ ++ /** ++ * Retry operation on file descriptor ++ * ++ * If this flag is set then retry operation on file descriptor ++ * until .size bytes have been copied or an error or EOF is ++ * detected. 
++ */ ++ FUSE_BUF_FD_RETRY = (1 << 3), + }; + + /** + * Buffer copy flags + */ + enum fuse_buf_copy_flags { +- /** +- * Don't use splice(2) +- * +- * Always fall back to using read and write instead of +- * splice(2) to copy data from one file descriptor to another. +- * +- * If this flag is not set, then only fall back if splice is +- * unavailable. +- */ +- FUSE_BUF_NO_SPLICE = (1 << 1), +- +- /** +- * Force splice +- * +- * Always use splice(2) to copy data from one file descriptor +- * to another. If splice is not available, return -EINVAL. +- */ +- FUSE_BUF_FORCE_SPLICE = (1 << 2), +- +- /** +- * Try to move data with splice. +- * +- * If splice is used, try to move pages from the source to the +- * destination instead of copying. See documentation of +- * SPLICE_F_MOVE in splice(2) man page. +- */ +- FUSE_BUF_SPLICE_MOVE = (1 << 3), +- +- /** +- * Don't block on the pipe when copying data with splice +- * +- * Makes the operations on the pipe non-blocking (if the pipe +- * is full or empty). See SPLICE_F_NONBLOCK in the splice(2) +- * man page. +- */ +- FUSE_BUF_SPLICE_NONBLOCK= (1 << 4), ++ /** ++ * Don't use splice(2) ++ * ++ * Always fall back to using read and write instead of ++ * splice(2) to copy data from one file descriptor to another. ++ * ++ * If this flag is not set, then only fall back if splice is ++ * unavailable. ++ */ ++ FUSE_BUF_NO_SPLICE = (1 << 1), ++ ++ /** ++ * Force splice ++ * ++ * Always use splice(2) to copy data from one file descriptor ++ * to another. If splice is not available, return -EINVAL. ++ */ ++ FUSE_BUF_FORCE_SPLICE = (1 << 2), ++ ++ /** ++ * Try to move data with splice. ++ * ++ * If splice is used, try to move pages from the source to the ++ * destination instead of copying. See documentation of ++ * SPLICE_F_MOVE in splice(2) man page. 
++ */ ++ FUSE_BUF_SPLICE_MOVE = (1 << 3), ++ ++ /** ++ * Don't block on the pipe when copying data with splice ++ * ++ * Makes the operations on the pipe non-blocking (if the pipe ++ * is full or empty). See SPLICE_F_NONBLOCK in the splice(2) ++ * man page. ++ */ ++ FUSE_BUF_SPLICE_NONBLOCK = (1 << 4), + }; + + /** +@@ -636,36 +653,36 @@ enum fuse_buf_copy_flags { + * be supplied as a memory pointer or as a file descriptor + */ + struct fuse_buf { +- /** +- * Size of data in bytes +- */ +- size_t size; +- +- /** +- * Buffer flags +- */ +- enum fuse_buf_flags flags; +- +- /** +- * Memory pointer +- * +- * Used unless FUSE_BUF_IS_FD flag is set. +- */ +- void *mem; +- +- /** +- * File descriptor +- * +- * Used if FUSE_BUF_IS_FD flag is set. +- */ +- int fd; +- +- /** +- * File position +- * +- * Used if FUSE_BUF_FD_SEEK flag is set. +- */ +- off_t pos; ++ /** ++ * Size of data in bytes ++ */ ++ size_t size; ++ ++ /** ++ * Buffer flags ++ */ ++ enum fuse_buf_flags flags; ++ ++ /** ++ * Memory pointer ++ * ++ * Used unless FUSE_BUF_IS_FD flag is set. ++ */ ++ void *mem; ++ ++ /** ++ * File descriptor ++ * ++ * Used if FUSE_BUF_IS_FD flag is set. ++ */ ++ int fd; ++ ++ /** ++ * File position ++ * ++ * Used if FUSE_BUF_FD_SEEK flag is set. ++ */ ++ off_t pos; + }; + + /** +@@ -677,41 +694,39 @@ struct fuse_buf { + * Allocate dynamically to add more than one buffer. 
+ */ + struct fuse_bufvec { +- /** +- * Number of buffers in the array +- */ +- size_t count; +- +- /** +- * Index of current buffer within the array +- */ +- size_t idx; +- +- /** +- * Current offset within the current buffer +- */ +- size_t off; +- +- /** +- * Array of buffers +- */ +- struct fuse_buf buf[1]; ++ /** ++ * Number of buffers in the array ++ */ ++ size_t count; ++ ++ /** ++ * Index of current buffer within the array ++ */ ++ size_t idx; ++ ++ /** ++ * Current offset within the current buffer ++ */ ++ size_t off; ++ ++ /** ++ * Array of buffers ++ */ ++ struct fuse_buf buf[1]; + }; + + /* Initialize bufvec with a single buffer of given size */ +-#define FUSE_BUFVEC_INIT(size__) \ +- ((struct fuse_bufvec) { \ +- /* .count= */ 1, \ +- /* .idx = */ 0, \ +- /* .off = */ 0, \ +- /* .buf = */ { /* [0] = */ { \ +- /* .size = */ (size__), \ +- /* .flags = */ (enum fuse_buf_flags) 0, \ +- /* .mem = */ NULL, \ +- /* .fd = */ -1, \ +- /* .pos = */ 0, \ +- } } \ +- } ) ++#define FUSE_BUFVEC_INIT(size__) \ ++ ((struct fuse_bufvec){ /* .count= */ 1, \ ++ /* .idx = */ 0, \ ++ /* .off = */ 0, /* .buf = */ \ ++ { /* [0] = */ { \ ++ /* .size = */ (size__), \ ++ /* .flags = */ (enum fuse_buf_flags)0, \ ++ /* .mem = */ NULL, \ ++ /* .fd = */ -1, \ ++ /* .pos = */ 0, \ ++ } } }) + + /** + * Get total size of data in a fuse buffer vector +@@ -730,16 +745,16 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv); + * @return actual number of bytes copied or -errno on error + */ + ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src, +- enum fuse_buf_copy_flags flags); ++ enum fuse_buf_copy_flags flags); + +-/* ----------------------------------------------------------- * +- * Signal handling * +- * ----------------------------------------------------------- */ ++/* ++ * Signal handling ++ */ + + /** + * Exit session on HUP, TERM and INT signals and ignore PIPE signal + * +- * Stores session in a global variable. 
May only be called once per ++ * Stores session in a global variable. May only be called once per + * process until fuse_remove_signal_handlers() is called. + * + * Once either of the POSIX signals arrives, the signal handler calls +@@ -766,12 +781,12 @@ int fuse_set_signal_handlers(struct fuse_session *se); + */ + void fuse_remove_signal_handlers(struct fuse_session *se); + +-/* ----------------------------------------------------------- * +- * Compatibility stuff * +- * ----------------------------------------------------------- */ ++/* ++ * Compatibility stuff ++ */ + + #if !defined(FUSE_USE_VERSION) || FUSE_USE_VERSION < 30 +-# error only API version 30 or greater is supported ++#error only API version 30 or greater is supported + #endif + + +@@ -781,11 +796,14 @@ void fuse_remove_signal_handlers(struct fuse_session *se); + * On 32bit systems please add -D_FILE_OFFSET_BITS=64 to your compile flags! + */ + +-#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && !defined __cplusplus ++#if defined(__GNUC__) && \ ++ (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && \ ++ !defined __cplusplus + _Static_assert(sizeof(off_t) == 8, "fuse: off_t must be 64bit"); + #else +-struct _fuse_off_t_must_be_64bit_dummy_struct \ +- { unsigned _fuse_off_t_must_be_64bit:((sizeof(off_t) == 8) ? 1 : -1); }; ++struct _fuse_off_t_must_be_64bit_dummy_struct { ++ unsigned _fuse_off_t_must_be_64bit:((sizeof(off_t) == 8) ? 1 : -1); ++}; + #endif + + #endif /* FUSE_COMMON_H_ */ +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index b39522e..e63cb58 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -1,71 +1,71 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. 
+- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ + + #include "fuse.h" + #include "fuse_lowlevel.h" + + struct fuse_req { +- struct fuse_session *se; +- uint64_t unique; +- int ctr; +- pthread_mutex_t lock; +- struct fuse_ctx ctx; +- struct fuse_chan *ch; +- int interrupted; +- unsigned int ioctl_64bit : 1; +- union { +- struct { +- uint64_t unique; +- } i; +- struct { +- fuse_interrupt_func_t func; +- void *data; +- } ni; +- } u; +- struct fuse_req *next; +- struct fuse_req *prev; ++ struct fuse_session *se; ++ uint64_t unique; ++ int ctr; ++ pthread_mutex_t lock; ++ struct fuse_ctx ctx; ++ struct fuse_chan *ch; ++ int interrupted; ++ unsigned int ioctl_64bit:1; ++ union { ++ struct { ++ uint64_t unique; ++ } i; ++ struct { ++ fuse_interrupt_func_t func; ++ void *data; ++ } ni; ++ } u; ++ struct fuse_req *next; ++ struct fuse_req *prev; + }; + + struct fuse_notify_req { +- uint64_t unique; +- void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t, +- const void *, const struct fuse_buf *); +- struct fuse_notify_req *next; +- struct fuse_notify_req *prev; ++ uint64_t unique; ++ void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t, ++ const void *, const struct fuse_buf *); ++ struct fuse_notify_req *next; ++ struct fuse_notify_req *prev; + }; + + struct fuse_session { +- char *mountpoint; +- volatile int exited; +- int fd; +- int debug; +- int deny_others; +- struct fuse_lowlevel_ops op; +- int got_init; +- struct cuse_data *cuse_data; +- void *userdata; +- uid_t owner; +- struct fuse_conn_info conn; +- struct fuse_req list; +- struct fuse_req interrupts; +- pthread_mutex_t lock; +- int got_destroy; +- int broken_splice_nonblock; +- uint64_t notify_ctr; +- struct fuse_notify_req notify_list; +- size_t bufsize; +- int error; ++ char *mountpoint; ++ volatile int exited; ++ int 
fd; ++ int debug; ++ int deny_others; ++ struct fuse_lowlevel_ops op; ++ int got_init; ++ struct cuse_data *cuse_data; ++ void *userdata; ++ uid_t owner; ++ struct fuse_conn_info conn; ++ struct fuse_req list; ++ struct fuse_req interrupts; ++ pthread_mutex_t lock; ++ int got_destroy; ++ int broken_splice_nonblock; ++ uint64_t notify_ctr; ++ struct fuse_notify_req notify_list; ++ size_t bufsize; ++ int error; + }; + + struct fuse_chan { +- pthread_mutex_t lock; +- int ctr; +- int fd; ++ pthread_mutex_t lock; ++ int ctr; ++ int fd; + }; + + /** +@@ -76,19 +76,20 @@ struct fuse_chan { + * + */ + struct fuse_module { +- char *name; +- fuse_module_factory_t factory; +- struct fuse_module *next; +- struct fusemod_so *so; +- int ctr; ++ char *name; ++ fuse_module_factory_t factory; ++ struct fuse_module *next; ++ struct fusemod_so *so; ++ int ctr; + }; + + int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, +- int count); ++ int count); + void fuse_free_req(fuse_req_t req); + + void fuse_session_process_buf_int(struct fuse_session *se, +- const struct fuse_buf *buf, struct fuse_chan *ch); ++ const struct fuse_buf *buf, ++ struct fuse_chan *ch); + + + #define FUSE_MAX_MAX_PAGES 256 +diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c +index 0d268ab..11345f9 100644 +--- a/tools/virtiofsd/fuse_log.c ++++ b/tools/virtiofsd/fuse_log.c +@@ -1,40 +1,40 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2019 Red Hat, Inc. +- +- Logging API. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2019 Red Hat, Inc. ++ * ++ * Logging API. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. 
++ * See the file COPYING.LIB ++ */ + + #include "fuse_log.h" + + #include + #include + +-static void default_log_func( +- __attribute__(( unused )) enum fuse_log_level level, +- const char *fmt, va_list ap) ++static void default_log_func(__attribute__((unused)) enum fuse_log_level level, ++ const char *fmt, va_list ap) + { +- vfprintf(stderr, fmt, ap); ++ vfprintf(stderr, fmt, ap); + } + + static fuse_log_func_t log_func = default_log_func; + + void fuse_set_log_func(fuse_log_func_t func) + { +- if (!func) +- func = default_log_func; ++ if (!func) { ++ func = default_log_func; ++ } + +- log_func = func; ++ log_func = func; + } + + void fuse_log(enum fuse_log_level level, const char *fmt, ...) + { +- va_list ap; ++ va_list ap; + +- va_start(ap, fmt); +- log_func(level, fmt, ap); +- va_end(ap); ++ va_start(ap, fmt); ++ log_func(level, fmt, ap); ++ va_end(ap); + } +diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h +index 0af700d..bf6c11f 100644 +--- a/tools/virtiofsd/fuse_log.h ++++ b/tools/virtiofsd/fuse_log.h +@@ -1,10 +1,10 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2019 Red Hat, Inc. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2019 Red Hat, Inc. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. ++ */ + + #ifndef FUSE_LOG_H_ + #define FUSE_LOG_H_ +@@ -22,14 +22,14 @@ + * These levels correspond to syslog(2) log levels since they are widely used. 
+ */ + enum fuse_log_level { +- FUSE_LOG_EMERG, +- FUSE_LOG_ALERT, +- FUSE_LOG_CRIT, +- FUSE_LOG_ERR, +- FUSE_LOG_WARNING, +- FUSE_LOG_NOTICE, +- FUSE_LOG_INFO, +- FUSE_LOG_DEBUG ++ FUSE_LOG_EMERG, ++ FUSE_LOG_ALERT, ++ FUSE_LOG_CRIT, ++ FUSE_LOG_ERR, ++ FUSE_LOG_WARNING, ++ FUSE_LOG_NOTICE, ++ FUSE_LOG_INFO, ++ FUSE_LOG_DEBUG + }; + + /** +@@ -45,8 +45,8 @@ enum fuse_log_level { + * @param fmt sprintf-style format string including newline + * @param ap format string arguments + */ +-typedef void (*fuse_log_func_t)(enum fuse_log_level level, +- const char *fmt, va_list ap); ++typedef void (*fuse_log_func_t)(enum fuse_log_level level, const char *fmt, ++ va_list ap); + + /** + * Install a custom log handler function. +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index e6fa247..5c9cb52 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1,2380 +1,2515 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- Implementation of (most of) the low-level FUSE API. The session loop +- functions are implemented in separate files. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * Implementation of (most of) the low-level FUSE API. The session loop ++ * functions are implemented in separate files. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. 
++ * See the file COPYING.LIB ++ */ + + #define _GNU_SOURCE + + #include "config.h" + #include "fuse_i.h" + #include "fuse_kernel.h" +-#include "fuse_opt.h" + #include "fuse_misc.h" ++#include "fuse_opt.h" + ++#include ++#include ++#include ++#include + #include + #include +-#include + #include +-#include +-#include +-#include +-#include + #include +- ++#include + + + #define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) + #define OFFSET_MAX 0x7fffffffffffffffLL + +-#define container_of(ptr, type, member) ({ \ +- const typeof( ((type *)0)->member ) *__mptr = (ptr); \ +- (type *)( (char *)__mptr - offsetof(type,member) );}) ++#define container_of(ptr, type, member) \ ++ ({ \ ++ const typeof(((type *)0)->member) *__mptr = (ptr); \ ++ (type *)((char *)__mptr - offsetof(type, member)); \ ++ }) + + struct fuse_pollhandle { +- uint64_t kh; +- struct fuse_session *se; ++ uint64_t kh; ++ struct fuse_session *se; + }; + + static size_t pagesize; + + static __attribute__((constructor)) void fuse_ll_init_pagesize(void) + { +- pagesize = getpagesize(); ++ pagesize = getpagesize(); + } + + static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr) + { +- attr->ino = stbuf->st_ino; +- attr->mode = stbuf->st_mode; +- attr->nlink = stbuf->st_nlink; +- attr->uid = stbuf->st_uid; +- attr->gid = stbuf->st_gid; +- attr->rdev = stbuf->st_rdev; +- attr->size = stbuf->st_size; +- attr->blksize = stbuf->st_blksize; +- attr->blocks = stbuf->st_blocks; +- attr->atime = stbuf->st_atime; +- attr->mtime = stbuf->st_mtime; +- attr->ctime = stbuf->st_ctime; +- attr->atimensec = ST_ATIM_NSEC(stbuf); +- attr->mtimensec = ST_MTIM_NSEC(stbuf); +- attr->ctimensec = ST_CTIM_NSEC(stbuf); ++ attr->ino = stbuf->st_ino; ++ attr->mode = stbuf->st_mode; ++ attr->nlink = stbuf->st_nlink; ++ attr->uid = stbuf->st_uid; ++ attr->gid = stbuf->st_gid; ++ attr->rdev = stbuf->st_rdev; ++ attr->size = stbuf->st_size; ++ attr->blksize = stbuf->st_blksize; ++ attr->blocks = stbuf->st_blocks; ++ 
attr->atime = stbuf->st_atime; ++ attr->mtime = stbuf->st_mtime; ++ attr->ctime = stbuf->st_ctime; ++ attr->atimensec = ST_ATIM_NSEC(stbuf); ++ attr->mtimensec = ST_MTIM_NSEC(stbuf); ++ attr->ctimensec = ST_CTIM_NSEC(stbuf); + } + + static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf) + { +- stbuf->st_mode = attr->mode; +- stbuf->st_uid = attr->uid; +- stbuf->st_gid = attr->gid; +- stbuf->st_size = attr->size; +- stbuf->st_atime = attr->atime; +- stbuf->st_mtime = attr->mtime; +- stbuf->st_ctime = attr->ctime; +- ST_ATIM_NSEC_SET(stbuf, attr->atimensec); +- ST_MTIM_NSEC_SET(stbuf, attr->mtimensec); +- ST_CTIM_NSEC_SET(stbuf, attr->ctimensec); ++ stbuf->st_mode = attr->mode; ++ stbuf->st_uid = attr->uid; ++ stbuf->st_gid = attr->gid; ++ stbuf->st_size = attr->size; ++ stbuf->st_atime = attr->atime; ++ stbuf->st_mtime = attr->mtime; ++ stbuf->st_ctime = attr->ctime; ++ ST_ATIM_NSEC_SET(stbuf, attr->atimensec); ++ ST_MTIM_NSEC_SET(stbuf, attr->mtimensec); ++ ST_CTIM_NSEC_SET(stbuf, attr->ctimensec); + } + +-static size_t iov_length(const struct iovec *iov, size_t count) ++static size_t iov_length(const struct iovec *iov, size_t count) + { +- size_t seg; +- size_t ret = 0; ++ size_t seg; ++ size_t ret = 0; + +- for (seg = 0; seg < count; seg++) +- ret += iov[seg].iov_len; +- return ret; ++ for (seg = 0; seg < count; seg++) { ++ ret += iov[seg].iov_len; ++ } ++ return ret; + } + + static void list_init_req(struct fuse_req *req) + { +- req->next = req; +- req->prev = req; ++ req->next = req; ++ req->prev = req; + } + + static void list_del_req(struct fuse_req *req) + { +- struct fuse_req *prev = req->prev; +- struct fuse_req *next = req->next; +- prev->next = next; +- next->prev = prev; ++ struct fuse_req *prev = req->prev; ++ struct fuse_req *next = req->next; ++ prev->next = next; ++ next->prev = prev; + } + + static void list_add_req(struct fuse_req *req, struct fuse_req *next) + { +- struct fuse_req *prev = next->prev; +- req->next = next; 
+- req->prev = prev; +- prev->next = req; +- next->prev = req; ++ struct fuse_req *prev = next->prev; ++ req->next = next; ++ req->prev = prev; ++ prev->next = req; ++ next->prev = req; + } + + static void destroy_req(fuse_req_t req) + { +- pthread_mutex_destroy(&req->lock); +- free(req); ++ pthread_mutex_destroy(&req->lock); ++ free(req); + } + + void fuse_free_req(fuse_req_t req) + { +- int ctr; +- struct fuse_session *se = req->se; ++ int ctr; ++ struct fuse_session *se = req->se; + +- pthread_mutex_lock(&se->lock); +- req->u.ni.func = NULL; +- req->u.ni.data = NULL; +- list_del_req(req); +- ctr = --req->ctr; +- req->ch = NULL; +- pthread_mutex_unlock(&se->lock); +- if (!ctr) +- destroy_req(req); ++ pthread_mutex_lock(&se->lock); ++ req->u.ni.func = NULL; ++ req->u.ni.data = NULL; ++ list_del_req(req); ++ ctr = --req->ctr; ++ req->ch = NULL; ++ pthread_mutex_unlock(&se->lock); ++ if (!ctr) { ++ destroy_req(req); ++ } + } + + static struct fuse_req *fuse_ll_alloc_req(struct fuse_session *se) + { +- struct fuse_req *req; ++ struct fuse_req *req; + +- req = (struct fuse_req *) calloc(1, sizeof(struct fuse_req)); +- if (req == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n"); +- } else { +- req->se = se; +- req->ctr = 1; +- list_init_req(req); +- fuse_mutex_init(&req->lock); +- } ++ req = (struct fuse_req *)calloc(1, sizeof(struct fuse_req)); ++ if (req == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n"); ++ } else { ++ req->se = se; ++ req->ctr = 1; ++ list_init_req(req); ++ fuse_mutex_init(&req->lock); ++ } + +- return req; ++ return req; + } + + /* Send data. 
If *ch* is NULL, send via session master fd */ + static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, +- struct iovec *iov, int count) ++ struct iovec *iov, int count) + { +- struct fuse_out_header *out = iov[0].iov_base; ++ struct fuse_out_header *out = iov[0].iov_base; + +- out->len = iov_length(iov, count); +- if (se->debug) { +- if (out->unique == 0) { +- fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", +- out->error, out->len); +- } else if (out->error) { +- fuse_log(FUSE_LOG_DEBUG, +- " unique: %llu, error: %i (%s), outsize: %i\n", +- (unsigned long long) out->unique, out->error, +- strerror(-out->error), out->len); +- } else { +- fuse_log(FUSE_LOG_DEBUG, +- " unique: %llu, success, outsize: %i\n", +- (unsigned long long) out->unique, out->len); +- } +- } ++ out->len = iov_length(iov, count); ++ if (se->debug) { ++ if (out->unique == 0) { ++ fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error, ++ out->len); ++ } else if (out->error) { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, error: %i (%s), outsize: %i\n", ++ (unsigned long long)out->unique, out->error, ++ strerror(-out->error), out->len); ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %i\n", ++ (unsigned long long)out->unique, out->len); ++ } ++ } + +- abort(); /* virtio should have taken it before here */ +- return 0; ++ abort(); /* virtio should have taken it before here */ ++ return 0; + } + + + int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, +- int count) ++ int count) + { +- struct fuse_out_header out; ++ struct fuse_out_header out; + +- if (error <= -1000 || error > 0) { +- fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); +- error = -ERANGE; +- } ++ if (error <= -1000 || error > 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); ++ error = -ERANGE; ++ } + +- out.unique = req->unique; +- out.error = error; ++ out.unique = req->unique; ++ out.error = error; + +- 
iov[0].iov_base = &out; +- iov[0].iov_len = sizeof(struct fuse_out_header); ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); + +- return fuse_send_msg(req->se, req->ch, iov, count); ++ return fuse_send_msg(req->se, req->ch, iov, count); + } + + static int send_reply_iov(fuse_req_t req, int error, struct iovec *iov, +- int count) ++ int count) + { +- int res; ++ int res; + +- res = fuse_send_reply_iov_nofree(req, error, iov, count); +- fuse_free_req(req); +- return res; ++ res = fuse_send_reply_iov_nofree(req, error, iov, count); ++ fuse_free_req(req); ++ return res; + } + + static int send_reply(fuse_req_t req, int error, const void *arg, +- size_t argsize) ++ size_t argsize) + { +- struct iovec iov[2]; +- int count = 1; +- if (argsize) { +- iov[1].iov_base = (void *) arg; +- iov[1].iov_len = argsize; +- count++; +- } +- return send_reply_iov(req, error, iov, count); ++ struct iovec iov[2]; ++ int count = 1; ++ if (argsize) { ++ iov[1].iov_base = (void *)arg; ++ iov[1].iov_len = argsize; ++ count++; ++ } ++ return send_reply_iov(req, error, iov, count); + } + + int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count) + { +- int res; +- struct iovec *padded_iov; ++ int res; ++ struct iovec *padded_iov; + +- padded_iov = malloc((count + 1) * sizeof(struct iovec)); +- if (padded_iov == NULL) +- return fuse_reply_err(req, ENOMEM); ++ padded_iov = malloc((count + 1) * sizeof(struct iovec)); ++ if (padded_iov == NULL) { ++ return fuse_reply_err(req, ENOMEM); ++ } + +- memcpy(padded_iov + 1, iov, count * sizeof(struct iovec)); +- count++; ++ memcpy(padded_iov + 1, iov, count * sizeof(struct iovec)); ++ count++; + +- res = send_reply_iov(req, 0, padded_iov, count); +- free(padded_iov); ++ res = send_reply_iov(req, 0, padded_iov, count); ++ free(padded_iov); + +- return res; ++ return res; + } + + +-/* `buf` is allowed to be empty so that the proper size may be +- allocated by the caller */ ++/* ++ * 'buf` is allowed to be empty 
so that the proper size may be ++ * allocated by the caller ++ */ + size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, +- const char *name, const struct stat *stbuf, off_t off) ++ const char *name, const struct stat *stbuf, off_t off) + { +- (void)req; +- size_t namelen; +- size_t entlen; +- size_t entlen_padded; +- struct fuse_dirent *dirent; ++ (void)req; ++ size_t namelen; ++ size_t entlen; ++ size_t entlen_padded; ++ struct fuse_dirent *dirent; + +- namelen = strlen(name); +- entlen = FUSE_NAME_OFFSET + namelen; +- entlen_padded = FUSE_DIRENT_ALIGN(entlen); ++ namelen = strlen(name); ++ entlen = FUSE_NAME_OFFSET + namelen; ++ entlen_padded = FUSE_DIRENT_ALIGN(entlen); + +- if ((buf == NULL) || (entlen_padded > bufsize)) +- return entlen_padded; ++ if ((buf == NULL) || (entlen_padded > bufsize)) { ++ return entlen_padded; ++ } + +- dirent = (struct fuse_dirent*) buf; +- dirent->ino = stbuf->st_ino; +- dirent->off = off; +- dirent->namelen = namelen; +- dirent->type = (stbuf->st_mode & S_IFMT) >> 12; +- memcpy(dirent->name, name, namelen); +- memset(dirent->name + namelen, 0, entlen_padded - entlen); ++ dirent = (struct fuse_dirent *)buf; ++ dirent->ino = stbuf->st_ino; ++ dirent->off = off; ++ dirent->namelen = namelen; ++ dirent->type = (stbuf->st_mode & S_IFMT) >> 12; ++ memcpy(dirent->name, name, namelen); ++ memset(dirent->name + namelen, 0, entlen_padded - entlen); + +- return entlen_padded; ++ return entlen_padded; + } + + static void convert_statfs(const struct statvfs *stbuf, +- struct fuse_kstatfs *kstatfs) ++ struct fuse_kstatfs *kstatfs) + { +- kstatfs->bsize = stbuf->f_bsize; +- kstatfs->frsize = stbuf->f_frsize; +- kstatfs->blocks = stbuf->f_blocks; +- kstatfs->bfree = stbuf->f_bfree; +- kstatfs->bavail = stbuf->f_bavail; +- kstatfs->files = stbuf->f_files; +- kstatfs->ffree = stbuf->f_ffree; +- kstatfs->namelen = stbuf->f_namemax; ++ kstatfs->bsize = stbuf->f_bsize; ++ kstatfs->frsize = stbuf->f_frsize; ++ kstatfs->blocks = 
stbuf->f_blocks; ++ kstatfs->bfree = stbuf->f_bfree; ++ kstatfs->bavail = stbuf->f_bavail; ++ kstatfs->files = stbuf->f_files; ++ kstatfs->ffree = stbuf->f_ffree; ++ kstatfs->namelen = stbuf->f_namemax; + } + + static int send_reply_ok(fuse_req_t req, const void *arg, size_t argsize) + { +- return send_reply(req, 0, arg, argsize); ++ return send_reply(req, 0, arg, argsize); + } + + int fuse_reply_err(fuse_req_t req, int err) + { +- return send_reply(req, -err, NULL, 0); ++ return send_reply(req, -err, NULL, 0); + } + + void fuse_reply_none(fuse_req_t req) + { +- fuse_free_req(req); ++ fuse_free_req(req); + } + + static unsigned long calc_timeout_sec(double t) + { +- if (t > (double) ULONG_MAX) +- return ULONG_MAX; +- else if (t < 0.0) +- return 0; +- else +- return (unsigned long) t; ++ if (t > (double)ULONG_MAX) { ++ return ULONG_MAX; ++ } else if (t < 0.0) { ++ return 0; ++ } else { ++ return (unsigned long)t; ++ } + } + + static unsigned int calc_timeout_nsec(double t) + { +- double f = t - (double) calc_timeout_sec(t); +- if (f < 0.0) +- return 0; +- else if (f >= 0.999999999) +- return 999999999; +- else +- return (unsigned int) (f * 1.0e9); ++ double f = t - (double)calc_timeout_sec(t); ++ if (f < 0.0) { ++ return 0; ++ } else if (f >= 0.999999999) { ++ return 999999999; ++ } else { ++ return (unsigned int)(f * 1.0e9); ++ } + } + + static void fill_entry(struct fuse_entry_out *arg, +- const struct fuse_entry_param *e) ++ const struct fuse_entry_param *e) + { +- arg->nodeid = e->ino; +- arg->generation = e->generation; +- arg->entry_valid = calc_timeout_sec(e->entry_timeout); +- arg->entry_valid_nsec = calc_timeout_nsec(e->entry_timeout); +- arg->attr_valid = calc_timeout_sec(e->attr_timeout); +- arg->attr_valid_nsec = calc_timeout_nsec(e->attr_timeout); +- convert_stat(&e->attr, &arg->attr); ++ arg->nodeid = e->ino; ++ arg->generation = e->generation; ++ arg->entry_valid = calc_timeout_sec(e->entry_timeout); ++ arg->entry_valid_nsec = 
calc_timeout_nsec(e->entry_timeout); ++ arg->attr_valid = calc_timeout_sec(e->attr_timeout); ++ arg->attr_valid_nsec = calc_timeout_nsec(e->attr_timeout); ++ convert_stat(&e->attr, &arg->attr); + } + +-/* `buf` is allowed to be empty so that the proper size may be +- allocated by the caller */ ++/* ++ * `buf` is allowed to be empty so that the proper size may be ++ * allocated by the caller ++ */ + size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, +- const char *name, +- const struct fuse_entry_param *e, off_t off) +-{ +- (void)req; +- size_t namelen; +- size_t entlen; +- size_t entlen_padded; +- +- namelen = strlen(name); +- entlen = FUSE_NAME_OFFSET_DIRENTPLUS + namelen; +- entlen_padded = FUSE_DIRENT_ALIGN(entlen); +- if ((buf == NULL) || (entlen_padded > bufsize)) +- return entlen_padded; +- +- struct fuse_direntplus *dp = (struct fuse_direntplus *) buf; +- memset(&dp->entry_out, 0, sizeof(dp->entry_out)); +- fill_entry(&dp->entry_out, e); +- +- struct fuse_dirent *dirent = &dp->dirent; +- dirent->ino = e->attr.st_ino; +- dirent->off = off; +- dirent->namelen = namelen; +- dirent->type = (e->attr.st_mode & S_IFMT) >> 12; +- memcpy(dirent->name, name, namelen); +- memset(dirent->name + namelen, 0, entlen_padded - entlen); +- +- return entlen_padded; +-} +- +-static void fill_open(struct fuse_open_out *arg, +- const struct fuse_file_info *f) +-{ +- arg->fh = f->fh; +- if (f->direct_io) +- arg->open_flags |= FOPEN_DIRECT_IO; +- if (f->keep_cache) +- arg->open_flags |= FOPEN_KEEP_CACHE; +- if (f->cache_readdir) +- arg->open_flags |= FOPEN_CACHE_DIR; +- if (f->nonseekable) +- arg->open_flags |= FOPEN_NONSEEKABLE; ++ const char *name, ++ const struct fuse_entry_param *e, off_t off) ++{ ++ (void)req; ++ size_t namelen; ++ size_t entlen; ++ size_t entlen_padded; ++ ++ namelen = strlen(name); ++ entlen = FUSE_NAME_OFFSET_DIRENTPLUS + namelen; ++ entlen_padded = FUSE_DIRENT_ALIGN(entlen); ++ if ((buf == NULL) || (entlen_padded > bufsize)) { ++ 
return entlen_padded; ++ } ++ ++ struct fuse_direntplus *dp = (struct fuse_direntplus *)buf; ++ memset(&dp->entry_out, 0, sizeof(dp->entry_out)); ++ fill_entry(&dp->entry_out, e); ++ ++ struct fuse_dirent *dirent = &dp->dirent; ++ dirent->ino = e->attr.st_ino; ++ dirent->off = off; ++ dirent->namelen = namelen; ++ dirent->type = (e->attr.st_mode & S_IFMT) >> 12; ++ memcpy(dirent->name, name, namelen); ++ memset(dirent->name + namelen, 0, entlen_padded - entlen); ++ ++ return entlen_padded; ++} ++ ++static void fill_open(struct fuse_open_out *arg, const struct fuse_file_info *f) ++{ ++ arg->fh = f->fh; ++ if (f->direct_io) { ++ arg->open_flags |= FOPEN_DIRECT_IO; ++ } ++ if (f->keep_cache) { ++ arg->open_flags |= FOPEN_KEEP_CACHE; ++ } ++ if (f->cache_readdir) { ++ arg->open_flags |= FOPEN_CACHE_DIR; ++ } ++ if (f->nonseekable) { ++ arg->open_flags |= FOPEN_NONSEEKABLE; ++ } + } + + int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e) + { +- struct fuse_entry_out arg; +- size_t size = req->se->conn.proto_minor < 9 ? +- FUSE_COMPAT_ENTRY_OUT_SIZE : sizeof(arg); ++ struct fuse_entry_out arg; ++ size_t size = req->se->conn.proto_minor < 9 ? 
FUSE_COMPAT_ENTRY_OUT_SIZE : ++ sizeof(arg); + +- /* before ABI 7.4 e->ino == 0 was invalid, only ENOENT meant +- negative entry */ +- if (!e->ino && req->se->conn.proto_minor < 4) +- return fuse_reply_err(req, ENOENT); ++ /* ++ * before ABI 7.4 e->ino == 0 was invalid, only ENOENT meant ++ * negative entry ++ */ ++ if (!e->ino && req->se->conn.proto_minor < 4) { ++ return fuse_reply_err(req, ENOENT); ++ } + +- memset(&arg, 0, sizeof(arg)); +- fill_entry(&arg, e); +- return send_reply_ok(req, &arg, size); ++ memset(&arg, 0, sizeof(arg)); ++ fill_entry(&arg, e); ++ return send_reply_ok(req, &arg, size); + } + + int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, +- const struct fuse_file_info *f) ++ const struct fuse_file_info *f) + { +- char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; +- size_t entrysize = req->se->conn.proto_minor < 9 ? +- FUSE_COMPAT_ENTRY_OUT_SIZE : sizeof(struct fuse_entry_out); +- struct fuse_entry_out *earg = (struct fuse_entry_out *) buf; +- struct fuse_open_out *oarg = (struct fuse_open_out *) (buf + entrysize); ++ char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; ++ size_t entrysize = req->se->conn.proto_minor < 9 ? ++ FUSE_COMPAT_ENTRY_OUT_SIZE : ++ sizeof(struct fuse_entry_out); ++ struct fuse_entry_out *earg = (struct fuse_entry_out *)buf; ++ struct fuse_open_out *oarg = (struct fuse_open_out *)(buf + entrysize); + +- memset(buf, 0, sizeof(buf)); +- fill_entry(earg, e); +- fill_open(oarg, f); +- return send_reply_ok(req, buf, +- entrysize + sizeof(struct fuse_open_out)); ++ memset(buf, 0, sizeof(buf)); ++ fill_entry(earg, e); ++ fill_open(oarg, f); ++ return send_reply_ok(req, buf, entrysize + sizeof(struct fuse_open_out)); + } + + int fuse_reply_attr(fuse_req_t req, const struct stat *attr, +- double attr_timeout) ++ double attr_timeout) + { +- struct fuse_attr_out arg; +- size_t size = req->se->conn.proto_minor < 9 ? 
+- FUSE_COMPAT_ATTR_OUT_SIZE : sizeof(arg); ++ struct fuse_attr_out arg; ++ size_t size = ++ req->se->conn.proto_minor < 9 ? FUSE_COMPAT_ATTR_OUT_SIZE : sizeof(arg); + +- memset(&arg, 0, sizeof(arg)); +- arg.attr_valid = calc_timeout_sec(attr_timeout); +- arg.attr_valid_nsec = calc_timeout_nsec(attr_timeout); +- convert_stat(attr, &arg.attr); ++ memset(&arg, 0, sizeof(arg)); ++ arg.attr_valid = calc_timeout_sec(attr_timeout); ++ arg.attr_valid_nsec = calc_timeout_nsec(attr_timeout); ++ convert_stat(attr, &arg.attr); + +- return send_reply_ok(req, &arg, size); ++ return send_reply_ok(req, &arg, size); + } + + int fuse_reply_readlink(fuse_req_t req, const char *linkname) + { +- return send_reply_ok(req, linkname, strlen(linkname)); ++ return send_reply_ok(req, linkname, strlen(linkname)); + } + + int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *f) + { +- struct fuse_open_out arg; ++ struct fuse_open_out arg; + +- memset(&arg, 0, sizeof(arg)); +- fill_open(&arg, f); +- return send_reply_ok(req, &arg, sizeof(arg)); ++ memset(&arg, 0, sizeof(arg)); ++ fill_open(&arg, f); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_write(fuse_req_t req, size_t count) + { +- struct fuse_write_out arg; ++ struct fuse_write_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.size = count; ++ memset(&arg, 0, sizeof(arg)); ++ arg.size = count; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size) + { +- return send_reply_ok(req, buf, size); ++ return send_reply_ok(req, buf, size); + } + + static int fuse_send_data_iov_fallback(struct fuse_session *se, +- struct fuse_chan *ch, +- struct iovec *iov, int iov_count, +- struct fuse_bufvec *buf, +- size_t len) ++ struct fuse_chan *ch, struct iovec *iov, ++ int iov_count, struct fuse_bufvec *buf, ++ size_t len) + { +- /* Optimize common case */ +- if (buf->count == 1 && buf->idx == 0 && 
buf->off == 0 && +- !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { +- /* FIXME: also avoid memory copy if there are multiple buffers +- but none of them contain an fd */ ++ /* Optimize common case */ ++ if (buf->count == 1 && buf->idx == 0 && buf->off == 0 && ++ !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { ++ /* ++ * FIXME: also avoid memory copy if there are multiple buffers ++ * but none of them contain an fd ++ */ + +- iov[iov_count].iov_base = buf->buf[0].mem; +- iov[iov_count].iov_len = len; +- iov_count++; +- return fuse_send_msg(se, ch, iov, iov_count); +- } ++ iov[iov_count].iov_base = buf->buf[0].mem; ++ iov[iov_count].iov_len = len; ++ iov_count++; ++ return fuse_send_msg(se, ch, iov, iov_count); ++ } + +- abort(); /* Will have taken vhost path */ +- return 0; ++ abort(); /* Will have taken vhost path */ ++ return 0; + } + + static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, +- struct iovec *iov, int iov_count, +- struct fuse_bufvec *buf, unsigned int flags) ++ struct iovec *iov, int iov_count, ++ struct fuse_bufvec *buf, unsigned int flags) + { +- size_t len = fuse_buf_size(buf); +- (void) flags; ++ size_t len = fuse_buf_size(buf); ++ (void)flags; + +- return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); ++ return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); + } + + int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags) ++ enum fuse_buf_copy_flags flags) + { +- struct iovec iov[2]; +- struct fuse_out_header out; +- int res; ++ struct iovec iov[2]; ++ struct fuse_out_header out; ++ int res; + +- iov[0].iov_base = &out; +- iov[0].iov_len = sizeof(struct fuse_out_header); ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); + +- out.unique = req->unique; +- out.error = 0; ++ out.unique = req->unique; ++ out.error = 0; + +- res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv, flags); +- if (res <= 0) { +- fuse_free_req(req); +- return 
res; +- } else { +- return fuse_reply_err(req, res); +- } ++ res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv, flags); ++ if (res <= 0) { ++ fuse_free_req(req); ++ return res; ++ } else { ++ return fuse_reply_err(req, res); ++ } + } + + int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf) + { +- struct fuse_statfs_out arg; +- size_t size = req->se->conn.proto_minor < 4 ? +- FUSE_COMPAT_STATFS_SIZE : sizeof(arg); ++ struct fuse_statfs_out arg; ++ size_t size = ++ req->se->conn.proto_minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(arg); + +- memset(&arg, 0, sizeof(arg)); +- convert_statfs(stbuf, &arg.st); ++ memset(&arg, 0, sizeof(arg)); ++ convert_statfs(stbuf, &arg.st); + +- return send_reply_ok(req, &arg, size); ++ return send_reply_ok(req, &arg, size); + } + + int fuse_reply_xattr(fuse_req_t req, size_t count) + { +- struct fuse_getxattr_out arg; ++ struct fuse_getxattr_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.size = count; ++ memset(&arg, 0, sizeof(arg)); ++ arg.size = count; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_lock(fuse_req_t req, const struct flock *lock) + { +- struct fuse_lk_out arg; ++ struct fuse_lk_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.lk.type = lock->l_type; +- if (lock->l_type != F_UNLCK) { +- arg.lk.start = lock->l_start; +- if (lock->l_len == 0) +- arg.lk.end = OFFSET_MAX; +- else +- arg.lk.end = lock->l_start + lock->l_len - 1; +- } +- arg.lk.pid = lock->l_pid; +- return send_reply_ok(req, &arg, sizeof(arg)); ++ memset(&arg, 0, sizeof(arg)); ++ arg.lk.type = lock->l_type; ++ if (lock->l_type != F_UNLCK) { ++ arg.lk.start = lock->l_start; ++ if (lock->l_len == 0) { ++ arg.lk.end = OFFSET_MAX; ++ } else { ++ arg.lk.end = lock->l_start + lock->l_len - 1; ++ } ++ } ++ arg.lk.pid = lock->l_pid; ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_bmap(fuse_req_t req, uint64_t idx) + { +- struct fuse_bmap_out 
arg; ++ struct fuse_bmap_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.block = idx; ++ memset(&arg, 0, sizeof(arg)); ++ arg.block = idx; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + static struct fuse_ioctl_iovec *fuse_ioctl_iovec_copy(const struct iovec *iov, +- size_t count) +-{ +- struct fuse_ioctl_iovec *fiov; +- size_t i; +- +- fiov = malloc(sizeof(fiov[0]) * count); +- if (!fiov) +- return NULL; +- +- for (i = 0; i < count; i++) { +- fiov[i].base = (uintptr_t) iov[i].iov_base; +- fiov[i].len = iov[i].iov_len; +- } +- +- return fiov; +-} +- +-int fuse_reply_ioctl_retry(fuse_req_t req, +- const struct iovec *in_iov, size_t in_count, +- const struct iovec *out_iov, size_t out_count) +-{ +- struct fuse_ioctl_out arg; +- struct fuse_ioctl_iovec *in_fiov = NULL; +- struct fuse_ioctl_iovec *out_fiov = NULL; +- struct iovec iov[4]; +- size_t count = 1; +- int res; +- +- memset(&arg, 0, sizeof(arg)); +- arg.flags |= FUSE_IOCTL_RETRY; +- arg.in_iovs = in_count; +- arg.out_iovs = out_count; +- iov[count].iov_base = &arg; +- iov[count].iov_len = sizeof(arg); +- count++; +- +- if (req->se->conn.proto_minor < 16) { +- if (in_count) { +- iov[count].iov_base = (void *)in_iov; +- iov[count].iov_len = sizeof(in_iov[0]) * in_count; +- count++; +- } +- +- if (out_count) { +- iov[count].iov_base = (void *)out_iov; +- iov[count].iov_len = sizeof(out_iov[0]) * out_count; +- count++; +- } +- } else { +- /* Can't handle non-compat 64bit ioctls on 32bit */ +- if (sizeof(void *) == 4 && req->ioctl_64bit) { +- res = fuse_reply_err(req, EINVAL); +- goto out; +- } +- +- if (in_count) { +- in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); +- if (!in_fiov) +- goto enomem; +- +- iov[count].iov_base = (void *)in_fiov; +- iov[count].iov_len = sizeof(in_fiov[0]) * in_count; +- count++; +- } +- if (out_count) { +- out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); +- if (!out_fiov) +- goto enomem; +- +- iov[count].iov_base = 
(void *)out_fiov; +- iov[count].iov_len = sizeof(out_fiov[0]) * out_count; +- count++; +- } +- } +- +- res = send_reply_iov(req, 0, iov, count); ++ size_t count) ++{ ++ struct fuse_ioctl_iovec *fiov; ++ size_t i; ++ ++ fiov = malloc(sizeof(fiov[0]) * count); ++ if (!fiov) { ++ return NULL; ++ } ++ ++ for (i = 0; i < count; i++) { ++ fiov[i].base = (uintptr_t)iov[i].iov_base; ++ fiov[i].len = iov[i].iov_len; ++ } ++ ++ return fiov; ++} ++ ++int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, ++ size_t in_count, const struct iovec *out_iov, ++ size_t out_count) ++{ ++ struct fuse_ioctl_out arg; ++ struct fuse_ioctl_iovec *in_fiov = NULL; ++ struct fuse_ioctl_iovec *out_fiov = NULL; ++ struct iovec iov[4]; ++ size_t count = 1; ++ int res; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.flags |= FUSE_IOCTL_RETRY; ++ arg.in_iovs = in_count; ++ arg.out_iovs = out_count; ++ iov[count].iov_base = &arg; ++ iov[count].iov_len = sizeof(arg); ++ count++; ++ ++ if (req->se->conn.proto_minor < 16) { ++ if (in_count) { ++ iov[count].iov_base = (void *)in_iov; ++ iov[count].iov_len = sizeof(in_iov[0]) * in_count; ++ count++; ++ } ++ ++ if (out_count) { ++ iov[count].iov_base = (void *)out_iov; ++ iov[count].iov_len = sizeof(out_iov[0]) * out_count; ++ count++; ++ } ++ } else { ++ /* Can't handle non-compat 64bit ioctls on 32bit */ ++ if (sizeof(void *) == 4 && req->ioctl_64bit) { ++ res = fuse_reply_err(req, EINVAL); ++ goto out; ++ } ++ ++ if (in_count) { ++ in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); ++ if (!in_fiov) { ++ goto enomem; ++ } ++ ++ iov[count].iov_base = (void *)in_fiov; ++ iov[count].iov_len = sizeof(in_fiov[0]) * in_count; ++ count++; ++ } ++ if (out_count) { ++ out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); ++ if (!out_fiov) { ++ goto enomem; ++ } ++ ++ iov[count].iov_base = (void *)out_fiov; ++ iov[count].iov_len = sizeof(out_fiov[0]) * out_count; ++ count++; ++ } ++ } ++ ++ res = send_reply_iov(req, 0, iov, count); + out: +- 
free(in_fiov); +- free(out_fiov); ++ free(in_fiov); ++ free(out_fiov); + +- return res; ++ return res; + + enomem: +- res = fuse_reply_err(req, ENOMEM); +- goto out; ++ res = fuse_reply_err(req, ENOMEM); ++ goto out; + } + + int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size) + { +- struct fuse_ioctl_out arg; +- struct iovec iov[3]; +- size_t count = 1; ++ struct fuse_ioctl_out arg; ++ struct iovec iov[3]; ++ size_t count = 1; + +- memset(&arg, 0, sizeof(arg)); +- arg.result = result; +- iov[count].iov_base = &arg; +- iov[count].iov_len = sizeof(arg); +- count++; ++ memset(&arg, 0, sizeof(arg)); ++ arg.result = result; ++ iov[count].iov_base = &arg; ++ iov[count].iov_len = sizeof(arg); ++ count++; + +- if (size) { +- iov[count].iov_base = (char *) buf; +- iov[count].iov_len = size; +- count++; +- } ++ if (size) { ++ iov[count].iov_base = (char *)buf; ++ iov[count].iov_len = size; ++ count++; ++ } + +- return send_reply_iov(req, 0, iov, count); ++ return send_reply_iov(req, 0, iov, count); + } + + int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, +- int count) ++ int count) + { +- struct iovec *padded_iov; +- struct fuse_ioctl_out arg; +- int res; ++ struct iovec *padded_iov; ++ struct fuse_ioctl_out arg; ++ int res; + +- padded_iov = malloc((count + 2) * sizeof(struct iovec)); +- if (padded_iov == NULL) +- return fuse_reply_err(req, ENOMEM); ++ padded_iov = malloc((count + 2) * sizeof(struct iovec)); ++ if (padded_iov == NULL) { ++ return fuse_reply_err(req, ENOMEM); ++ } + +- memset(&arg, 0, sizeof(arg)); +- arg.result = result; +- padded_iov[1].iov_base = &arg; +- padded_iov[1].iov_len = sizeof(arg); ++ memset(&arg, 0, sizeof(arg)); ++ arg.result = result; ++ padded_iov[1].iov_base = &arg; ++ padded_iov[1].iov_len = sizeof(arg); + +- memcpy(&padded_iov[2], iov, count * sizeof(struct iovec)); ++ memcpy(&padded_iov[2], iov, count * sizeof(struct iovec)); + +- res = send_reply_iov(req, 0, padded_iov, count + 2); 
+- free(padded_iov); ++ res = send_reply_iov(req, 0, padded_iov, count + 2); ++ free(padded_iov); + +- return res; ++ return res; + } + + int fuse_reply_poll(fuse_req_t req, unsigned revents) + { +- struct fuse_poll_out arg; ++ struct fuse_poll_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.revents = revents; ++ memset(&arg, 0, sizeof(arg)); ++ arg.revents = revents; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_lseek(fuse_req_t req, off_t off) + { +- struct fuse_lseek_out arg; ++ struct fuse_lseek_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.offset = off; ++ memset(&arg, 0, sizeof(arg)); ++ arg.offset = off; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; ++ char *name = (char *)inarg; + +- if (req->se->op.lookup) +- req->se->op.lookup(req, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.lookup) { ++ req->se->op.lookup(req, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_forget(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_forget_in *arg = (struct fuse_forget_in *) inarg; ++ struct fuse_forget_in *arg = (struct fuse_forget_in *)inarg; + +- if (req->se->op.forget) +- req->se->op.forget(req, nodeid, arg->nlookup); +- else +- fuse_reply_none(req); ++ if (req->se->op.forget) { ++ req->se->op.forget(req, nodeid, arg->nlookup); ++ } else { ++ fuse_reply_none(req); ++ } + } + + static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg) ++ const void *inarg) + { +- struct fuse_batch_forget_in *arg = (void *) inarg; +- struct fuse_forget_one *param = (void *) PARAM(arg); +- unsigned int i; ++ struct fuse_batch_forget_in *arg = (void *)inarg; ++ struct fuse_forget_one *param = (void *)PARAM(arg); ++ 
unsigned int i; + +- (void) nodeid; ++ (void)nodeid; + +- if (req->se->op.forget_multi) { +- req->se->op.forget_multi(req, arg->count, +- (struct fuse_forget_data *) param); +- } else if (req->se->op.forget) { +- for (i = 0; i < arg->count; i++) { +- struct fuse_forget_one *forget = &param[i]; +- struct fuse_req *dummy_req; ++ if (req->se->op.forget_multi) { ++ req->se->op.forget_multi(req, arg->count, ++ (struct fuse_forget_data *)param); ++ } else if (req->se->op.forget) { ++ for (i = 0; i < arg->count; i++) { ++ struct fuse_forget_one *forget = &param[i]; ++ struct fuse_req *dummy_req; + +- dummy_req = fuse_ll_alloc_req(req->se); +- if (dummy_req == NULL) +- break; ++ dummy_req = fuse_ll_alloc_req(req->se); ++ if (dummy_req == NULL) { ++ break; ++ } + +- dummy_req->unique = req->unique; +- dummy_req->ctx = req->ctx; +- dummy_req->ch = NULL; ++ dummy_req->unique = req->unique; ++ dummy_req->ctx = req->ctx; ++ dummy_req->ch = NULL; + +- req->se->op.forget(dummy_req, forget->nodeid, +- forget->nlookup); +- } +- fuse_reply_none(req); +- } else { +- fuse_reply_none(req); +- } ++ req->se->op.forget(dummy_req, forget->nodeid, forget->nlookup); ++ } ++ fuse_reply_none(req); ++ } else { ++ fuse_reply_none(req); ++ } + } + + static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_file_info *fip = NULL; +- struct fuse_file_info fi; ++ struct fuse_file_info *fip = NULL; ++ struct fuse_file_info fi; + +- if (req->se->conn.proto_minor >= 9) { +- struct fuse_getattr_in *arg = (struct fuse_getattr_in *) inarg; ++ if (req->se->conn.proto_minor >= 9) { ++ struct fuse_getattr_in *arg = (struct fuse_getattr_in *)inarg; + +- if (arg->getattr_flags & FUSE_GETATTR_FH) { +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fip = &fi; +- } +- } ++ if (arg->getattr_flags & FUSE_GETATTR_FH) { ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fip = &fi; ++ } ++ } + +- if (req->se->op.getattr) +- req->se->op.getattr(req, nodeid, fip); +- else +-
fuse_reply_err(req, ENOSYS); ++ if (req->se->op.getattr) { ++ req->se->op.getattr(req, nodeid, fip); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_setattr_in *arg = (struct fuse_setattr_in *) inarg; +- +- if (req->se->op.setattr) { +- struct fuse_file_info *fi = NULL; +- struct fuse_file_info fi_store; +- struct stat stbuf; +- memset(&stbuf, 0, sizeof(stbuf)); +- convert_attr(arg, &stbuf); +- if (arg->valid & FATTR_FH) { +- arg->valid &= ~FATTR_FH; +- memset(&fi_store, 0, sizeof(fi_store)); +- fi = &fi_store; +- fi->fh = arg->fh; +- } +- arg->valid &= +- FUSE_SET_ATTR_MODE | +- FUSE_SET_ATTR_UID | +- FUSE_SET_ATTR_GID | +- FUSE_SET_ATTR_SIZE | +- FUSE_SET_ATTR_ATIME | +- FUSE_SET_ATTR_MTIME | +- FUSE_SET_ATTR_ATIME_NOW | +- FUSE_SET_ATTR_MTIME_NOW | +- FUSE_SET_ATTR_CTIME; +- +- req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); +- } else +- fuse_reply_err(req, ENOSYS); ++ struct fuse_setattr_in *arg = (struct fuse_setattr_in *)inarg; ++ ++ if (req->se->op.setattr) { ++ struct fuse_file_info *fi = NULL; ++ struct fuse_file_info fi_store; ++ struct stat stbuf; ++ memset(&stbuf, 0, sizeof(stbuf)); ++ convert_attr(arg, &stbuf); ++ if (arg->valid & FATTR_FH) { ++ arg->valid &= ~FATTR_FH; ++ memset(&fi_store, 0, sizeof(fi_store)); ++ fi = &fi_store; ++ fi->fh = arg->fh; ++ } ++ arg->valid &= FUSE_SET_ATTR_MODE | FUSE_SET_ATTR_UID | ++ FUSE_SET_ATTR_GID | FUSE_SET_ATTR_SIZE | ++ FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME | ++ FUSE_SET_ATTR_ATIME_NOW | FUSE_SET_ATTR_MTIME_NOW | ++ FUSE_SET_ATTR_CTIME; ++ ++ req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_access(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_access_in *arg = (struct fuse_access_in *) inarg; ++ struct fuse_access_in *arg = (struct fuse_access_in *)inarg; + +- if (req->se->op.access) +- 
req->se->op.access(req, nodeid, arg->mask); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.access) { ++ req->se->op.access(req, nodeid, arg->mask); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- (void) inarg; ++ (void)inarg; + +- if (req->se->op.readlink) +- req->se->op.readlink(req, nodeid); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.readlink) { ++ req->se->op.readlink(req, nodeid); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_mknod_in *arg = (struct fuse_mknod_in *) inarg; +- char *name = PARAM(arg); ++ struct fuse_mknod_in *arg = (struct fuse_mknod_in *)inarg; ++ char *name = PARAM(arg); + +- if (req->se->conn.proto_minor >= 12) +- req->ctx.umask = arg->umask; +- else +- name = (char *) inarg + FUSE_COMPAT_MKNOD_IN_SIZE; ++ if (req->se->conn.proto_minor >= 12) { ++ req->ctx.umask = arg->umask; ++ } else { ++ name = (char *)inarg + FUSE_COMPAT_MKNOD_IN_SIZE; ++ } + +- if (req->se->op.mknod) +- req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.mknod) { ++ req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *) inarg; ++ struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *)inarg; + +- if (req->se->conn.proto_minor >= 12) +- req->ctx.umask = arg->umask; ++ if (req->se->conn.proto_minor >= 12) { ++ req->ctx.umask = arg->umask; ++ } + +- if (req->se->op.mkdir) +- req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.mkdir) { ++ req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); ++ } else { ++ fuse_reply_err(req, ENOSYS); 
++ } + } + + static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; ++ char *name = (char *)inarg; + +- if (req->se->op.unlink) +- req->se->op.unlink(req, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.unlink) { ++ req->se->op.unlink(req, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; ++ char *name = (char *)inarg; + +- if (req->se->op.rmdir) +- req->se->op.rmdir(req, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.rmdir) { ++ req->se->op.rmdir(req, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; +- char *linkname = ((char *) inarg) + strlen((char *) inarg) + 1; ++ char *name = (char *)inarg; ++ char *linkname = ((char *)inarg) + strlen((char *)inarg) + 1; + +- if (req->se->op.symlink) +- req->se->op.symlink(req, linkname, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.symlink) { ++ req->se->op.symlink(req, linkname, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_rename(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_rename_in *arg = (struct fuse_rename_in *) inarg; +- char *oldname = PARAM(arg); +- char *newname = oldname + strlen(oldname) + 1; ++ struct fuse_rename_in *arg = (struct fuse_rename_in *)inarg; ++ char *oldname = PARAM(arg); ++ char *newname = oldname + strlen(oldname) + 1; + +- if (req->se->op.rename) +- req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, +- 0); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.rename) { ++ req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, 0); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void 
do_rename2(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_rename2_in *arg = (struct fuse_rename2_in *) inarg; +- char *oldname = PARAM(arg); +- char *newname = oldname + strlen(oldname) + 1; ++ struct fuse_rename2_in *arg = (struct fuse_rename2_in *)inarg; ++ char *oldname = PARAM(arg); ++ char *newname = oldname + strlen(oldname) + 1; + +- if (req->se->op.rename) +- req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, +- arg->flags); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.rename) { ++ req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, ++ arg->flags); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_link(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_link_in *arg = (struct fuse_link_in *) inarg; ++ struct fuse_link_in *arg = (struct fuse_link_in *)inarg; + +- if (req->se->op.link) +- req->se->op.link(req, arg->oldnodeid, nodeid, PARAM(arg)); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.link) { ++ req->se->op.link(req, arg->oldnodeid, nodeid, PARAM(arg)); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_create_in *arg = (struct fuse_create_in *) inarg; ++ struct fuse_create_in *arg = (struct fuse_create_in *)inarg; + +- if (req->se->op.create) { +- struct fuse_file_info fi; +- char *name = PARAM(arg); ++ if (req->se->op.create) { ++ struct fuse_file_info fi; ++ char *name = PARAM(arg); + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; + +- if (req->se->conn.proto_minor >= 12) +- req->ctx.umask = arg->umask; +- else +- name = (char *) inarg + sizeof(struct fuse_open_in); ++ if (req->se->conn.proto_minor >= 12) { ++ req->ctx.umask = arg->umask; ++ } else { ++ name = (char *)inarg + sizeof(struct fuse_open_in); ++ } + +- req->se->op.create(req, nodeid, name, arg->mode, &fi); 
+- } else +- fuse_reply_err(req, ENOSYS); ++ req->se->op.create(req, nodeid, name, arg->mode, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_open(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_open_in *arg = (struct fuse_open_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_open_in *arg = (struct fuse_open_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; + +- if (req->se->op.open) +- req->se->op.open(req, nodeid, &fi); +- else +- fuse_reply_open(req, &fi); ++ if (req->se->op.open) { ++ req->se->op.open(req, nodeid, &fi); ++ } else { ++ fuse_reply_open(req, &fi); ++ } + } + + static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_read_in *arg = (struct fuse_read_in *) inarg; ++ struct fuse_read_in *arg = (struct fuse_read_in *)inarg; + +- if (req->se->op.read) { +- struct fuse_file_info fi; ++ if (req->se->op.read) { ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- if (req->se->conn.proto_minor >= 9) { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- } +- req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); +- } else +- fuse_reply_err(req, ENOSYS); ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ if (req->se->conn.proto_minor >= 9) { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ } ++ req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_write_in *arg = (struct fuse_write_in *) inarg; +- struct fuse_file_info fi; +- char *param; ++ struct fuse_write_in *arg = (struct fuse_write_in *)inarg; ++ struct fuse_file_info fi; ++ char *param; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.writepage = (arg->write_flags & 
FUSE_WRITE_CACHE) != 0; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; + +- if (req->se->conn.proto_minor < 9) { +- param = ((char *) arg) + FUSE_COMPAT_WRITE_IN_SIZE; +- } else { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- param = PARAM(arg); +- } ++ if (req->se->conn.proto_minor < 9) { ++ param = ((char *)arg) + FUSE_COMPAT_WRITE_IN_SIZE; ++ } else { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ param = PARAM(arg); ++ } + +- if (req->se->op.write) +- req->se->op.write(req, nodeid, param, arg->size, +- arg->offset, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.write) { ++ req->se->op.write(req, nodeid, param, arg->size, arg->offset, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, +- const struct fuse_buf *ibuf) +-{ +- struct fuse_session *se = req->se; +- struct fuse_bufvec bufv = { +- .buf[0] = *ibuf, +- .count = 1, +- }; +- struct fuse_write_in *arg = (struct fuse_write_in *) inarg; +- struct fuse_file_info fi; +- +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; +- +- if (se->conn.proto_minor < 9) { +- bufv.buf[0].mem = ((char *) arg) + FUSE_COMPAT_WRITE_IN_SIZE; +- bufv.buf[0].size -= sizeof(struct fuse_in_header) + +- FUSE_COMPAT_WRITE_IN_SIZE; +- assert(!(bufv.buf[0].flags & FUSE_BUF_IS_FD)); +- } else { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) +- bufv.buf[0].mem = PARAM(arg); +- +- bufv.buf[0].size -= sizeof(struct fuse_in_header) + +- sizeof(struct fuse_write_in); +- } +- if (bufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); +- fuse_reply_err(req, EIO); +- return; +- } +- bufv.buf[0].size = arg->size; +- +- se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); ++ 
const struct fuse_buf *ibuf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_bufvec bufv = { ++ .buf[0] = *ibuf, ++ .count = 1, ++ }; ++ struct fuse_write_in *arg = (struct fuse_write_in *)inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; ++ ++ if (se->conn.proto_minor < 9) { ++ bufv.buf[0].mem = ((char *)arg) + FUSE_COMPAT_WRITE_IN_SIZE; ++ bufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + FUSE_COMPAT_WRITE_IN_SIZE; ++ assert(!(bufv.buf[0].flags & FUSE_BUF_IS_FD)); ++ } else { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { ++ bufv.buf[0].mem = PARAM(arg); ++ } ++ ++ bufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); ++ } ++ if (bufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ bufv.buf[0].size = arg->size; ++ ++ se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); + } + + static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_flush_in *arg = (struct fuse_flush_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_flush_in *arg = (struct fuse_flush_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.flush = 1; +- if (req->se->conn.proto_minor >= 7) +- fi.lock_owner = arg->lock_owner; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.flush = 1; ++ if (req->se->conn.proto_minor >= 7) { ++ fi.lock_owner = arg->lock_owner; ++ } + +- if (req->se->op.flush) +- req->se->op.flush(req, nodeid, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.flush) { ++ req->se->op.flush(req, nodeid, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_release_in *arg 
= (struct fuse_release_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_release_in *arg = (struct fuse_release_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; +- fi.fh = arg->fh; +- if (req->se->conn.proto_minor >= 8) { +- fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; +- fi.lock_owner = arg->lock_owner; +- } +- if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { +- fi.flock_release = 1; +- fi.lock_owner = arg->lock_owner; +- } ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ fi.fh = arg->fh; ++ if (req->se->conn.proto_minor >= 8) { ++ fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; ++ fi.lock_owner = arg->lock_owner; ++ } ++ if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { ++ fi.flock_release = 1; ++ fi.lock_owner = arg->lock_owner; ++ } + +- if (req->se->op.release) +- req->se->op.release(req, nodeid, &fi); +- else +- fuse_reply_err(req, 0); ++ if (req->se->op.release) { ++ req->se->op.release(req, nodeid, &fi); ++ } else { ++ fuse_reply_err(req, 0); ++ } + } + + static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; +- struct fuse_file_info fi; +- int datasync = arg->fsync_flags & 1; ++ struct fuse_fsync_in *arg = (struct fuse_fsync_in *)inarg; ++ struct fuse_file_info fi; ++ int datasync = arg->fsync_flags & 1; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.fsync) +- req->se->op.fsync(req, nodeid, datasync, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.fsync) { ++ req->se->op.fsync(req, nodeid, datasync, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_open_in *arg = (struct fuse_open_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_open_in *arg = (struct 
fuse_open_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; + +- if (req->se->op.opendir) +- req->se->op.opendir(req, nodeid, &fi); +- else +- fuse_reply_open(req, &fi); ++ if (req->se->op.opendir) { ++ req->se->op.opendir(req, nodeid, &fi); ++ } else { ++ fuse_reply_open(req, &fi); ++ } + } + + static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_read_in *arg = (struct fuse_read_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_read_in *arg = (struct fuse_read_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.readdir) +- req->se->op.readdir(req, nodeid, arg->size, arg->offset, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.readdir) { ++ req->se->op.readdir(req, nodeid, arg->size, arg->offset, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_read_in *arg = (struct fuse_read_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_read_in *arg = (struct fuse_read_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.readdirplus) +- req->se->op.readdirplus(req, nodeid, arg->size, arg->offset, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.readdirplus) { ++ req->se->op.readdirplus(req, nodeid, arg->size, arg->offset, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_release_in *arg = (struct fuse_release_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_release_in *arg = (struct fuse_release_in *)inarg; ++ struct 
fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ fi.fh = arg->fh; + +- if (req->se->op.releasedir) +- req->se->op.releasedir(req, nodeid, &fi); +- else +- fuse_reply_err(req, 0); ++ if (req->se->op.releasedir) { ++ req->se->op.releasedir(req, nodeid, &fi); ++ } else { ++ fuse_reply_err(req, 0); ++ } + } + + static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; +- struct fuse_file_info fi; +- int datasync = arg->fsync_flags & 1; ++ struct fuse_fsync_in *arg = (struct fuse_fsync_in *)inarg; ++ struct fuse_file_info fi; ++ int datasync = arg->fsync_flags & 1; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.fsyncdir) +- req->se->op.fsyncdir(req, nodeid, datasync, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.fsyncdir) { ++ req->se->op.fsyncdir(req, nodeid, datasync, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- (void) nodeid; +- (void) inarg; ++ (void)nodeid; ++ (void)inarg; + +- if (req->se->op.statfs) +- req->se->op.statfs(req, nodeid); +- else { +- struct statvfs buf = { +- .f_namemax = 255, +- .f_bsize = 512, +- }; +- fuse_reply_statfs(req, &buf); +- } ++ if (req->se->op.statfs) { ++ req->se->op.statfs(req, nodeid); ++ } else { ++ struct statvfs buf = { ++ .f_namemax = 255, ++ .f_bsize = 512, ++ }; ++ fuse_reply_statfs(req, &buf); ++ } + } + + static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *) inarg; +- char *name = PARAM(arg); +- char *value = name + strlen(name) + 1; ++ struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *)inarg; ++ char *name = PARAM(arg); ++ char *value = name 
+ strlen(name) + 1; + +- if (req->se->op.setxattr) +- req->se->op.setxattr(req, nodeid, name, value, arg->size, +- arg->flags); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.setxattr) { ++ req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg; ++ struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *)inarg; + +- if (req->se->op.getxattr) +- req->se->op.getxattr(req, nodeid, PARAM(arg), arg->size); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.getxattr) { ++ req->se->op.getxattr(req, nodeid, PARAM(arg), arg->size); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg; ++ struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *)inarg; + +- if (req->se->op.listxattr) +- req->se->op.listxattr(req, nodeid, arg->size); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.listxattr) { ++ req->se->op.listxattr(req, nodeid, arg->size); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; ++ char *name = (char *)inarg; + +- if (req->se->op.removexattr) +- req->se->op.removexattr(req, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.removexattr) { ++ req->se->op.removexattr(req, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void convert_fuse_file_lock(struct fuse_file_lock *fl, +- struct flock *flock) ++ struct flock *flock) + { +- memset(flock, 0, sizeof(struct flock)); +- flock->l_type = fl->type; +- flock->l_whence = SEEK_SET; +- flock->l_start = fl->start; +- if (fl->end == OFFSET_MAX) +- 
flock->l_len = 0; +- else +- flock->l_len = fl->end - fl->start + 1; +- flock->l_pid = fl->pid; ++ memset(flock, 0, sizeof(struct flock)); ++ flock->l_type = fl->type; ++ flock->l_whence = SEEK_SET; ++ flock->l_start = fl->start; ++ if (fl->end == OFFSET_MAX) { ++ flock->l_len = 0; ++ } else { ++ flock->l_len = fl->end - fl->start + 1; ++ } ++ flock->l_pid = fl->pid; + } + + static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_lk_in *arg = (struct fuse_lk_in *) inarg; +- struct fuse_file_info fi; +- struct flock flock; ++ struct fuse_lk_in *arg = (struct fuse_lk_in *)inarg; ++ struct fuse_file_info fi; ++ struct flock flock; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.lock_owner = arg->owner; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.lock_owner = arg->owner; + +- convert_fuse_file_lock(&arg->lk, &flock); +- if (req->se->op.getlk) +- req->se->op.getlk(req, nodeid, &fi, &flock); +- else +- fuse_reply_err(req, ENOSYS); ++ convert_fuse_file_lock(&arg->lk, &flock); ++ if (req->se->op.getlk) { ++ req->se->op.getlk(req, nodeid, &fi, &flock); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg, int sleep) +-{ +- struct fuse_lk_in *arg = (struct fuse_lk_in *) inarg; +- struct fuse_file_info fi; +- struct flock flock; +- +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.lock_owner = arg->owner; +- +- if (arg->lk_flags & FUSE_LK_FLOCK) { +- int op = 0; +- +- switch (arg->lk.type) { +- case F_RDLCK: +- op = LOCK_SH; +- break; +- case F_WRLCK: +- op = LOCK_EX; +- break; +- case F_UNLCK: +- op = LOCK_UN; +- break; +- } +- if (!sleep) +- op |= LOCK_NB; +- +- if (req->se->op.flock) +- req->se->op.flock(req, nodeid, &fi, op); +- else +- fuse_reply_err(req, ENOSYS); +- } else { +- convert_fuse_file_lock(&arg->lk, &flock); +- if (req->se->op.setlk) +- req->se->op.setlk(req, nodeid, &fi, &flock, sleep); +- else +- 
fuse_reply_err(req, ENOSYS); +- } ++ const void *inarg, int sleep) ++{ ++ struct fuse_lk_in *arg = (struct fuse_lk_in *)inarg; ++ struct fuse_file_info fi; ++ struct flock flock; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.lock_owner = arg->owner; ++ ++ if (arg->lk_flags & FUSE_LK_FLOCK) { ++ int op = 0; ++ ++ switch (arg->lk.type) { ++ case F_RDLCK: ++ op = LOCK_SH; ++ break; ++ case F_WRLCK: ++ op = LOCK_EX; ++ break; ++ case F_UNLCK: ++ op = LOCK_UN; ++ break; ++ } ++ if (!sleep) { ++ op |= LOCK_NB; ++ } ++ ++ if (req->se->op.flock) { ++ req->se->op.flock(req, nodeid, &fi, op); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } ++ } else { ++ convert_fuse_file_lock(&arg->lk, &flock); ++ if (req->se->op.setlk) { ++ req->se->op.setlk(req, nodeid, &fi, &flock, sleep); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } ++ } + } + + static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- do_setlk_common(req, nodeid, inarg, 0); ++ do_setlk_common(req, nodeid, inarg, 0); + } + + static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- do_setlk_common(req, nodeid, inarg, 1); ++ do_setlk_common(req, nodeid, inarg, 1); + } + + static int find_interrupted(struct fuse_session *se, struct fuse_req *req) + { +- struct fuse_req *curr; +- +- for (curr = se->list.next; curr != &se->list; curr = curr->next) { +- if (curr->unique == req->u.i.unique) { +- fuse_interrupt_func_t func; +- void *data; +- +- curr->ctr++; +- pthread_mutex_unlock(&se->lock); +- +- /* Ugh, ugly locking */ +- pthread_mutex_lock(&curr->lock); +- pthread_mutex_lock(&se->lock); +- curr->interrupted = 1; +- func = curr->u.ni.func; +- data = curr->u.ni.data; +- pthread_mutex_unlock(&se->lock); +- if (func) +- func(curr, data); +- pthread_mutex_unlock(&curr->lock); +- +- pthread_mutex_lock(&se->lock); +- curr->ctr--; +- if (!curr->ctr) +- destroy_req(curr); +- +- return 1; +- } +- } +- for (curr = se->interrupts.next; curr != &se->interrupts; +- curr 
= curr->next) { +- if (curr->u.i.unique == req->u.i.unique) +- return 1; +- } +- return 0; ++ struct fuse_req *curr; ++ ++ for (curr = se->list.next; curr != &se->list; curr = curr->next) { ++ if (curr->unique == req->u.i.unique) { ++ fuse_interrupt_func_t func; ++ void *data; ++ ++ curr->ctr++; ++ pthread_mutex_unlock(&se->lock); ++ ++ /* Ugh, ugly locking */ ++ pthread_mutex_lock(&curr->lock); ++ pthread_mutex_lock(&se->lock); ++ curr->interrupted = 1; ++ func = curr->u.ni.func; ++ data = curr->u.ni.data; ++ pthread_mutex_unlock(&se->lock); ++ if (func) { ++ func(curr, data); ++ } ++ pthread_mutex_unlock(&curr->lock); ++ ++ pthread_mutex_lock(&se->lock); ++ curr->ctr--; ++ if (!curr->ctr) { ++ destroy_req(curr); ++ } ++ ++ return 1; ++ } ++ } ++ for (curr = se->interrupts.next; curr != &se->interrupts; ++ curr = curr->next) { ++ if (curr->u.i.unique == req->u.i.unique) { ++ return 1; ++ } ++ } ++ return 0; + } + + static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_interrupt_in *arg = (struct fuse_interrupt_in *) inarg; +- struct fuse_session *se = req->se; ++ struct fuse_interrupt_in *arg = (struct fuse_interrupt_in *)inarg; ++ struct fuse_session *se = req->se; + +- (void) nodeid; +- if (se->debug) +- fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", +- (unsigned long long) arg->unique); ++ (void)nodeid; ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", ++ (unsigned long long)arg->unique); ++ } + +- req->u.i.unique = arg->unique; ++ req->u.i.unique = arg->unique; + +- pthread_mutex_lock(&se->lock); +- if (find_interrupted(se, req)) +- destroy_req(req); +- else +- list_add_req(req, &se->interrupts); +- pthread_mutex_unlock(&se->lock); ++ pthread_mutex_lock(&se->lock); ++ if (find_interrupted(se, req)) { ++ destroy_req(req); ++ } else { ++ list_add_req(req, &se->interrupts); ++ } ++ pthread_mutex_unlock(&se->lock); + } + + static struct fuse_req *check_interrupt(struct fuse_session *se, +- struct 
fuse_req *req) +-{ +- struct fuse_req *curr; +- +- for (curr = se->interrupts.next; curr != &se->interrupts; +- curr = curr->next) { +- if (curr->u.i.unique == req->unique) { +- req->interrupted = 1; +- list_del_req(curr); +- free(curr); +- return NULL; +- } +- } +- curr = se->interrupts.next; +- if (curr != &se->interrupts) { +- list_del_req(curr); +- list_init_req(curr); +- return curr; +- } else +- return NULL; ++ struct fuse_req *req) ++{ ++ struct fuse_req *curr; ++ ++ for (curr = se->interrupts.next; curr != &se->interrupts; ++ curr = curr->next) { ++ if (curr->u.i.unique == req->unique) { ++ req->interrupted = 1; ++ list_del_req(curr); ++ free(curr); ++ return NULL; ++ } ++ } ++ curr = se->interrupts.next; ++ if (curr != &se->interrupts) { ++ list_del_req(curr); ++ list_init_req(curr); ++ return curr; ++ } else { ++ return NULL; ++ } + } + + static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_bmap_in *arg = (struct fuse_bmap_in *) inarg; ++ struct fuse_bmap_in *arg = (struct fuse_bmap_in *)inarg; + +- if (req->se->op.bmap) +- req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.bmap) { ++ req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_ioctl_in *arg = (struct fuse_ioctl_in *) inarg; +- unsigned int flags = arg->flags; +- void *in_buf = arg->in_size ? PARAM(arg) : NULL; +- struct fuse_file_info fi; ++ struct fuse_ioctl_in *arg = (struct fuse_ioctl_in *)inarg; ++ unsigned int flags = arg->flags; ++ void *in_buf = arg->in_size ? 
PARAM(arg) : NULL; ++ struct fuse_file_info fi; + +- if (flags & FUSE_IOCTL_DIR && +- !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { +- fuse_reply_err(req, ENOTTY); +- return; +- } ++ if (flags & FUSE_IOCTL_DIR && !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { ++ fuse_reply_err(req, ENOTTY); ++ return; ++ } + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (sizeof(void *) == 4 && req->se->conn.proto_minor >= 16 && +- !(flags & FUSE_IOCTL_32BIT)) { +- req->ioctl_64bit = 1; +- } ++ if (sizeof(void *) == 4 && req->se->conn.proto_minor >= 16 && ++ !(flags & FUSE_IOCTL_32BIT)) { ++ req->ioctl_64bit = 1; ++ } + +- if (req->se->op.ioctl) +- req->se->op.ioctl(req, nodeid, arg->cmd, +- (void *)(uintptr_t)arg->arg, &fi, flags, +- in_buf, arg->in_size, arg->out_size); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.ioctl) { ++ req->se->op.ioctl(req, nodeid, arg->cmd, (void *)(uintptr_t)arg->arg, ++ &fi, flags, in_buf, arg->in_size, arg->out_size); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + void fuse_pollhandle_destroy(struct fuse_pollhandle *ph) + { +- free(ph); ++ free(ph); + } + + static void do_poll(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_poll_in *arg = (struct fuse_poll_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_poll_in *arg = (struct fuse_poll_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.poll_events = arg->events; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.poll_events = arg->events; + +- if (req->se->op.poll) { +- struct fuse_pollhandle *ph = NULL; ++ if (req->se->op.poll) { ++ struct fuse_pollhandle *ph = NULL; + +- if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) { +- ph = malloc(sizeof(struct fuse_pollhandle)); +- if (ph == NULL) { +- fuse_reply_err(req, ENOMEM); +- return; +- } +- ph->kh = arg->kh; +- ph->se = req->se; +- } ++ if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) { ++ 
ph = malloc(sizeof(struct fuse_pollhandle)); ++ if (ph == NULL) { ++ fuse_reply_err(req, ENOMEM); ++ return; ++ } ++ ph->kh = arg->kh; ++ ph->se = req->se; ++ } + +- req->se->op.poll(req, nodeid, &fi, ph); +- } else { +- fuse_reply_err(req, ENOSYS); +- } ++ req->se->op.poll(req, nodeid, &fi, ph); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_fallocate_in *arg = (struct fuse_fallocate_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_fallocate_in *arg = (struct fuse_fallocate_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.fallocate) +- req->se->op.fallocate(req, nodeid, arg->mode, arg->offset, arg->length, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.fallocate) { ++ req->se->op.fallocate(req, nodeid, arg->mode, arg->offset, arg->length, ++ &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + +-static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, const void *inarg) ++static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, ++ const void *inarg) + { +- struct fuse_copy_file_range_in *arg = (struct fuse_copy_file_range_in *) inarg; +- struct fuse_file_info fi_in, fi_out; ++ struct fuse_copy_file_range_in *arg = ++ (struct fuse_copy_file_range_in *)inarg; ++ struct fuse_file_info fi_in, fi_out; + +- memset(&fi_in, 0, sizeof(fi_in)); +- fi_in.fh = arg->fh_in; ++ memset(&fi_in, 0, sizeof(fi_in)); ++ fi_in.fh = arg->fh_in; + +- memset(&fi_out, 0, sizeof(fi_out)); +- fi_out.fh = arg->fh_out; ++ memset(&fi_out, 0, sizeof(fi_out)); ++ fi_out.fh = arg->fh_out; + + +- if (req->se->op.copy_file_range) +- req->se->op.copy_file_range(req, nodeid_in, arg->off_in, +- &fi_in, arg->nodeid_out, +- arg->off_out, &fi_out, arg->len, +- arg->flags); +- else +- fuse_reply_err(req, ENOSYS); ++ if 
(req->se->op.copy_file_range) { ++ req->se->op.copy_file_range(req, nodeid_in, arg->off_in, &fi_in, ++ arg->nodeid_out, arg->off_out, &fi_out, ++ arg->len, arg->flags); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_lseek_in *arg = (struct fuse_lseek_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_lseek_in *arg = (struct fuse_lseek_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.lseek) +- req->se->op.lseek(req, nodeid, arg->offset, arg->whence, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.lseek) { ++ req->se->op.lseek(req, nodeid, arg->offset, arg->whence, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_init_in *arg = (struct fuse_init_in *) inarg; +- struct fuse_init_out outarg; +- struct fuse_session *se = req->se; +- size_t bufsize = se->bufsize; +- size_t outargsize = sizeof(outarg); +- +- (void) nodeid; +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); +- if (arg->major == 7 && arg->minor >= 6) { +- fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); +- fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", +- arg->max_readahead); +- } +- } +- se->conn.proto_major = arg->major; +- se->conn.proto_minor = arg->minor; +- se->conn.capable = 0; +- se->conn.want = 0; +- +- memset(&outarg, 0, sizeof(outarg)); +- outarg.major = FUSE_KERNEL_VERSION; +- outarg.minor = FUSE_KERNEL_MINOR_VERSION; +- +- if (arg->major < 7) { +- fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", +- arg->major, arg->minor); +- fuse_reply_err(req, EPROTO); +- return; +- } +- +- if (arg->major > 7) { +- /* Wait for a second INIT request with a 7.X version */ +- send_reply_ok(req, 
&outarg, sizeof(outarg)); +- return; +- } +- +- if (arg->minor >= 6) { +- if (arg->max_readahead < se->conn.max_readahead) +- se->conn.max_readahead = arg->max_readahead; +- if (arg->flags & FUSE_ASYNC_READ) +- se->conn.capable |= FUSE_CAP_ASYNC_READ; +- if (arg->flags & FUSE_POSIX_LOCKS) +- se->conn.capable |= FUSE_CAP_POSIX_LOCKS; +- if (arg->flags & FUSE_ATOMIC_O_TRUNC) +- se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; +- if (arg->flags & FUSE_EXPORT_SUPPORT) +- se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; +- if (arg->flags & FUSE_DONT_MASK) +- se->conn.capable |= FUSE_CAP_DONT_MASK; +- if (arg->flags & FUSE_FLOCK_LOCKS) +- se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; +- if (arg->flags & FUSE_AUTO_INVAL_DATA) +- se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; +- if (arg->flags & FUSE_DO_READDIRPLUS) +- se->conn.capable |= FUSE_CAP_READDIRPLUS; +- if (arg->flags & FUSE_READDIRPLUS_AUTO) +- se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; +- if (arg->flags & FUSE_ASYNC_DIO) +- se->conn.capable |= FUSE_CAP_ASYNC_DIO; +- if (arg->flags & FUSE_WRITEBACK_CACHE) +- se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; +- if (arg->flags & FUSE_NO_OPEN_SUPPORT) +- se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; +- if (arg->flags & FUSE_PARALLEL_DIROPS) +- se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; +- if (arg->flags & FUSE_POSIX_ACL) +- se->conn.capable |= FUSE_CAP_POSIX_ACL; +- if (arg->flags & FUSE_HANDLE_KILLPRIV) +- se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; +- if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) +- se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; +- if (!(arg->flags & FUSE_MAX_PAGES)) { +- size_t max_bufsize = +- FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() +- + FUSE_BUFFER_HEADER_SIZE; +- if (bufsize > max_bufsize) { +- bufsize = max_bufsize; +- } +- } +- } else { +- se->conn.max_readahead = 0; +- } +- +- if (se->conn.proto_minor >= 14) { ++ struct fuse_init_in *arg = (struct fuse_init_in *)inarg; ++ struct fuse_init_out outarg; ++ struct fuse_session *se = req->se; ++ 
size_t bufsize = se->bufsize; ++ size_t outargsize = sizeof(outarg); ++ ++ (void)nodeid; ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); ++ if (arg->major == 7 && arg->minor >= 6) { ++ fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); ++ fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", ++ arg->max_readahead); ++ } ++ } ++ se->conn.proto_major = arg->major; ++ se->conn.proto_minor = arg->minor; ++ se->conn.capable = 0; ++ se->conn.want = 0; ++ ++ memset(&outarg, 0, sizeof(outarg)); ++ outarg.major = FUSE_KERNEL_VERSION; ++ outarg.minor = FUSE_KERNEL_MINOR_VERSION; ++ ++ if (arg->major < 7) { ++ fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", ++ arg->major, arg->minor); ++ fuse_reply_err(req, EPROTO); ++ return; ++ } ++ ++ if (arg->major > 7) { ++ /* Wait for a second INIT request with a 7.X version */ ++ send_reply_ok(req, &outarg, sizeof(outarg)); ++ return; ++ } ++ ++ if (arg->minor >= 6) { ++ if (arg->max_readahead < se->conn.max_readahead) { ++ se->conn.max_readahead = arg->max_readahead; ++ } ++ if (arg->flags & FUSE_ASYNC_READ) { ++ se->conn.capable |= FUSE_CAP_ASYNC_READ; ++ } ++ if (arg->flags & FUSE_POSIX_LOCKS) { ++ se->conn.capable |= FUSE_CAP_POSIX_LOCKS; ++ } ++ if (arg->flags & FUSE_ATOMIC_O_TRUNC) { ++ se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; ++ } ++ if (arg->flags & FUSE_EXPORT_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; ++ } ++ if (arg->flags & FUSE_DONT_MASK) { ++ se->conn.capable |= FUSE_CAP_DONT_MASK; ++ } ++ if (arg->flags & FUSE_FLOCK_LOCKS) { ++ se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; ++ } ++ if (arg->flags & FUSE_AUTO_INVAL_DATA) { ++ se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; ++ } ++ if (arg->flags & FUSE_DO_READDIRPLUS) { ++ se->conn.capable |= FUSE_CAP_READDIRPLUS; ++ } ++ if (arg->flags & FUSE_READDIRPLUS_AUTO) { ++ se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; ++ } ++ if (arg->flags & FUSE_ASYNC_DIO) { ++ se->conn.capable |= 
FUSE_CAP_ASYNC_DIO; ++ } ++ if (arg->flags & FUSE_WRITEBACK_CACHE) { ++ se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; ++ } ++ if (arg->flags & FUSE_NO_OPEN_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; ++ } ++ if (arg->flags & FUSE_PARALLEL_DIROPS) { ++ se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; ++ } ++ if (arg->flags & FUSE_POSIX_ACL) { ++ se->conn.capable |= FUSE_CAP_POSIX_ACL; ++ } ++ if (arg->flags & FUSE_HANDLE_KILLPRIV) { ++ se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; ++ } ++ if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; ++ } ++ if (!(arg->flags & FUSE_MAX_PAGES)) { ++ size_t max_bufsize = ++ FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() + ++ FUSE_BUFFER_HEADER_SIZE; ++ if (bufsize > max_bufsize) { ++ bufsize = max_bufsize; ++ } ++ } ++ } else { ++ se->conn.max_readahead = 0; ++ } ++ ++ if (se->conn.proto_minor >= 14) { + #ifdef HAVE_SPLICE + #ifdef HAVE_VMSPLICE +- se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; ++ se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; + #endif +- se->conn.capable |= FUSE_CAP_SPLICE_READ; ++ se->conn.capable |= FUSE_CAP_SPLICE_READ; + #endif +- } +- if (se->conn.proto_minor >= 18) +- se->conn.capable |= FUSE_CAP_IOCTL_DIR; +- +- /* Default settings for modern filesystems. +- * +- * Most of these capabilities were disabled by default in +- * libfuse2 for backwards compatibility reasons. In libfuse3, +- * we can finally enable them by default (as long as they're +- * supported by the kernel). 
+- */ +-#define LL_SET_DEFAULT(cond, cap) \ +- if ((cond) && (se->conn.capable & (cap))) \ +- se->conn.want |= (cap) +- LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ); +- LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS); +- LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA); +- LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV); +- LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO); +- LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR); +- LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC); +- LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ); +- LL_SET_DEFAULT(se->op.getlk && se->op.setlk, +- FUSE_CAP_POSIX_LOCKS); +- LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS); +- LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS); +- LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir, +- FUSE_CAP_READDIRPLUS_AUTO); +- se->conn.time_gran = 1; +- +- if (bufsize < FUSE_MIN_READ_BUFFER) { +- fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n", +- bufsize); +- bufsize = FUSE_MIN_READ_BUFFER; +- } +- se->bufsize = bufsize; +- +- if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) +- se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE; +- +- se->got_init = 1; +- if (se->op.init) +- se->op.init(se->userdata, &se->conn); +- +- if (se->conn.want & (~se->conn.capable)) { +- fuse_log(FUSE_LOG_ERR, "fuse: error: filesystem requested capabilities " +- "0x%x that are not supported by kernel, aborting.\n", +- se->conn.want & (~se->conn.capable)); +- fuse_reply_err(req, EPROTO); +- se->error = -EPROTO; +- fuse_session_exit(se); +- return; +- } +- +- if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { +- se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; +- } +- if (arg->flags & FUSE_MAX_PAGES) { +- outarg.flags |= FUSE_MAX_PAGES; +- outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1; +- } +- +- /* Always enable big writes, this is superseded +- by the max_write option */ +- outarg.flags |= FUSE_BIG_WRITES; +- +- if (se->conn.want & FUSE_CAP_ASYNC_READ) +- outarg.flags |= 
FUSE_ASYNC_READ; +- if (se->conn.want & FUSE_CAP_POSIX_LOCKS) +- outarg.flags |= FUSE_POSIX_LOCKS; +- if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) +- outarg.flags |= FUSE_ATOMIC_O_TRUNC; +- if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) +- outarg.flags |= FUSE_EXPORT_SUPPORT; +- if (se->conn.want & FUSE_CAP_DONT_MASK) +- outarg.flags |= FUSE_DONT_MASK; +- if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) +- outarg.flags |= FUSE_FLOCK_LOCKS; +- if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) +- outarg.flags |= FUSE_AUTO_INVAL_DATA; +- if (se->conn.want & FUSE_CAP_READDIRPLUS) +- outarg.flags |= FUSE_DO_READDIRPLUS; +- if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) +- outarg.flags |= FUSE_READDIRPLUS_AUTO; +- if (se->conn.want & FUSE_CAP_ASYNC_DIO) +- outarg.flags |= FUSE_ASYNC_DIO; +- if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) +- outarg.flags |= FUSE_WRITEBACK_CACHE; +- if (se->conn.want & FUSE_CAP_POSIX_ACL) +- outarg.flags |= FUSE_POSIX_ACL; +- outarg.max_readahead = se->conn.max_readahead; +- outarg.max_write = se->conn.max_write; +- if (se->conn.proto_minor >= 13) { +- if (se->conn.max_background >= (1 << 16)) +- se->conn.max_background = (1 << 16) - 1; +- if (se->conn.congestion_threshold > se->conn.max_background) +- se->conn.congestion_threshold = se->conn.max_background; +- if (!se->conn.congestion_threshold) { +- se->conn.congestion_threshold = +- se->conn.max_background * 3 / 4; +- } +- +- outarg.max_background = se->conn.max_background; +- outarg.congestion_threshold = se->conn.congestion_threshold; +- } +- if (se->conn.proto_minor >= 23) +- outarg.time_gran = se->conn.time_gran; +- +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); +- fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); +- fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", +- outarg.max_readahead); +- fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); +- fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", +- outarg.max_background); 
+- fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", +- outarg.congestion_threshold); +- fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", +- outarg.time_gran); +- } +- if (arg->minor < 5) +- outargsize = FUSE_COMPAT_INIT_OUT_SIZE; +- else if (arg->minor < 23) +- outargsize = FUSE_COMPAT_22_INIT_OUT_SIZE; +- +- send_reply_ok(req, &outarg, outargsize); ++ } ++ if (se->conn.proto_minor >= 18) { ++ se->conn.capable |= FUSE_CAP_IOCTL_DIR; ++ } ++ ++ /* ++ * Default settings for modern filesystems. ++ * ++ * Most of these capabilities were disabled by default in ++ * libfuse2 for backwards compatibility reasons. In libfuse3, ++ * we can finally enable them by default (as long as they're ++ * supported by the kernel). ++ */ ++#define LL_SET_DEFAULT(cond, cap) \ ++ if ((cond) && (se->conn.capable & (cap))) \ ++ se->conn.want |= (cap) ++ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ); ++ LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS); ++ LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA); ++ LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV); ++ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO); ++ LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR); ++ LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC); ++ LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ); ++ LL_SET_DEFAULT(se->op.getlk && se->op.setlk, FUSE_CAP_POSIX_LOCKS); ++ LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS); ++ LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS); ++ LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir, ++ FUSE_CAP_READDIRPLUS_AUTO); ++ se->conn.time_gran = 1; ++ ++ if (bufsize < FUSE_MIN_READ_BUFFER) { ++ fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n", ++ bufsize); ++ bufsize = FUSE_MIN_READ_BUFFER; ++ } ++ se->bufsize = bufsize; ++ ++ if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) { ++ se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE; ++ } ++ ++ se->got_init = 1; ++ if (se->op.init) { ++ se->op.init(se->userdata, &se->conn); ++ } ++ ++ if (se->conn.want & (~se->conn.capable)) { ++ 
fuse_log(FUSE_LOG_ERR, ++ "fuse: error: filesystem requested capabilities " ++ "0x%x that are not supported by kernel, aborting.\n", ++ se->conn.want & (~se->conn.capable)); ++ fuse_reply_err(req, EPROTO); ++ se->error = -EPROTO; ++ fuse_session_exit(se); ++ return; ++ } ++ ++ if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { ++ se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; ++ } ++ if (arg->flags & FUSE_MAX_PAGES) { ++ outarg.flags |= FUSE_MAX_PAGES; ++ outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1; ++ } ++ ++ /* ++ * Always enable big writes, this is superseded ++ * by the max_write option ++ */ ++ outarg.flags |= FUSE_BIG_WRITES; ++ ++ if (se->conn.want & FUSE_CAP_ASYNC_READ) { ++ outarg.flags |= FUSE_ASYNC_READ; ++ } ++ if (se->conn.want & FUSE_CAP_POSIX_LOCKS) { ++ outarg.flags |= FUSE_POSIX_LOCKS; ++ } ++ if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) { ++ outarg.flags |= FUSE_ATOMIC_O_TRUNC; ++ } ++ if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) { ++ outarg.flags |= FUSE_EXPORT_SUPPORT; ++ } ++ if (se->conn.want & FUSE_CAP_DONT_MASK) { ++ outarg.flags |= FUSE_DONT_MASK; ++ } ++ if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) { ++ outarg.flags |= FUSE_FLOCK_LOCKS; ++ } ++ if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) { ++ outarg.flags |= FUSE_AUTO_INVAL_DATA; ++ } ++ if (se->conn.want & FUSE_CAP_READDIRPLUS) { ++ outarg.flags |= FUSE_DO_READDIRPLUS; ++ } ++ if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) { ++ outarg.flags |= FUSE_READDIRPLUS_AUTO; ++ } ++ if (se->conn.want & FUSE_CAP_ASYNC_DIO) { ++ outarg.flags |= FUSE_ASYNC_DIO; ++ } ++ if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) { ++ outarg.flags |= FUSE_WRITEBACK_CACHE; ++ } ++ if (se->conn.want & FUSE_CAP_POSIX_ACL) { ++ outarg.flags |= FUSE_POSIX_ACL; ++ } ++ outarg.max_readahead = se->conn.max_readahead; ++ outarg.max_write = se->conn.max_write; ++ if (se->conn.proto_minor >= 13) { ++ if (se->conn.max_background >= (1 << 16)) { ++ se->conn.max_background = (1 << 16) - 
1; ++ } ++ if (se->conn.congestion_threshold > se->conn.max_background) { ++ se->conn.congestion_threshold = se->conn.max_background; ++ } ++ if (!se->conn.congestion_threshold) { ++ se->conn.congestion_threshold = se->conn.max_background * 3 / 4; ++ } ++ ++ outarg.max_background = se->conn.max_background; ++ outarg.congestion_threshold = se->conn.congestion_threshold; ++ } ++ if (se->conn.proto_minor >= 23) { ++ outarg.time_gran = se->conn.time_gran; ++ } ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, ++ outarg.minor); ++ fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); ++ fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", ++ outarg.max_readahead); ++ fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); ++ fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", ++ outarg.max_background); ++ fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", ++ outarg.congestion_threshold); ++ fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); ++ } ++ if (arg->minor < 5) { ++ outargsize = FUSE_COMPAT_INIT_OUT_SIZE; ++ } else if (arg->minor < 23) { ++ outargsize = FUSE_COMPAT_22_INIT_OUT_SIZE; ++ } ++ ++ send_reply_ok(req, &outarg, outargsize); + } + + static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_session *se = req->se; ++ struct fuse_session *se = req->se; + +- (void) nodeid; +- (void) inarg; ++ (void)nodeid; ++ (void)inarg; + +- se->got_destroy = 1; +- if (se->op.destroy) +- se->op.destroy(se->userdata); ++ se->got_destroy = 1; ++ if (se->op.destroy) { ++ se->op.destroy(se->userdata); ++ } + +- send_reply_ok(req, NULL, 0); ++ send_reply_ok(req, NULL, 0); + } + + static void list_del_nreq(struct fuse_notify_req *nreq) + { +- struct fuse_notify_req *prev = nreq->prev; +- struct fuse_notify_req *next = nreq->next; +- prev->next = next; +- next->prev = prev; ++ struct fuse_notify_req *prev = nreq->prev; ++ struct fuse_notify_req *next = nreq->next; ++ prev->next = 
next; ++ next->prev = prev; + } + + static void list_add_nreq(struct fuse_notify_req *nreq, +- struct fuse_notify_req *next) ++ struct fuse_notify_req *next) + { +- struct fuse_notify_req *prev = next->prev; +- nreq->next = next; +- nreq->prev = prev; +- prev->next = nreq; +- next->prev = nreq; ++ struct fuse_notify_req *prev = next->prev; ++ nreq->next = next; ++ nreq->prev = prev; ++ prev->next = nreq; ++ next->prev = nreq; + } + + static void list_init_nreq(struct fuse_notify_req *nreq) + { +- nreq->next = nreq; +- nreq->prev = nreq; ++ nreq->next = nreq; ++ nreq->prev = nreq; + } + + static void do_notify_reply(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg, const struct fuse_buf *buf) ++ const void *inarg, const struct fuse_buf *buf) + { +- struct fuse_session *se = req->se; +- struct fuse_notify_req *nreq; +- struct fuse_notify_req *head; ++ struct fuse_session *se = req->se; ++ struct fuse_notify_req *nreq; ++ struct fuse_notify_req *head; + +- pthread_mutex_lock(&se->lock); +- head = &se->notify_list; +- for (nreq = head->next; nreq != head; nreq = nreq->next) { +- if (nreq->unique == req->unique) { +- list_del_nreq(nreq); +- break; +- } +- } +- pthread_mutex_unlock(&se->lock); ++ pthread_mutex_lock(&se->lock); ++ head = &se->notify_list; ++ for (nreq = head->next; nreq != head; nreq = nreq->next) { ++ if (nreq->unique == req->unique) { ++ list_del_nreq(nreq); ++ break; ++ } ++ } ++ pthread_mutex_unlock(&se->lock); + +- if (nreq != head) +- nreq->reply(nreq, req, nodeid, inarg, buf); ++ if (nreq != head) { ++ nreq->reply(nreq, req, nodeid, inarg, buf); ++ } + } + + static int send_notify_iov(struct fuse_session *se, int notify_code, +- struct iovec *iov, int count) ++ struct iovec *iov, int count) + { +- struct fuse_out_header out; ++ struct fuse_out_header out; + +- if (!se->got_init) +- return -ENOTCONN; ++ if (!se->got_init) { ++ return -ENOTCONN; ++ } + +- out.unique = 0; +- out.error = notify_code; +- iov[0].iov_base = &out; +- iov[0].iov_len 
= sizeof(struct fuse_out_header); ++ out.unique = 0; ++ out.error = notify_code; ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); + +- return fuse_send_msg(se, NULL, iov, count); ++ return fuse_send_msg(se, NULL, iov, count); + } + + int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph) + { +- if (ph != NULL) { +- struct fuse_notify_poll_wakeup_out outarg; +- struct iovec iov[2]; ++ if (ph != NULL) { ++ struct fuse_notify_poll_wakeup_out outarg; ++ struct iovec iov[2]; + +- outarg.kh = ph->kh; ++ outarg.kh = ph->kh; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); + +- return send_notify_iov(ph->se, FUSE_NOTIFY_POLL, iov, 2); +- } else { +- return 0; +- } ++ return send_notify_iov(ph->se, FUSE_NOTIFY_POLL, iov, 2); ++ } else { ++ return 0; ++ } + } + + int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, +- off_t off, off_t len) ++ off_t off, off_t len) + { +- struct fuse_notify_inval_inode_out outarg; +- struct iovec iov[2]; ++ struct fuse_notify_inval_inode_out outarg; ++ struct iovec iov[2]; ++ ++ if (!se) { ++ return -EINVAL; ++ } + +- if (!se) +- return -EINVAL; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) { ++ return -ENOSYS; ++ } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) +- return -ENOSYS; +- +- outarg.ino = ino; +- outarg.off = off; +- outarg.len = len; ++ outarg.ino = ino; ++ outarg.off = off; ++ outarg.len = len; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); + +- return send_notify_iov(se, FUSE_NOTIFY_INVAL_INODE, iov, 2); ++ return send_notify_iov(se, FUSE_NOTIFY_INVAL_INODE, iov, 2); + } + + int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, +- const char *name, size_t namelen) ++ const char *name, size_t namelen) + { +- struct fuse_notify_inval_entry_out 
outarg; +- struct iovec iov[3]; ++ struct fuse_notify_inval_entry_out outarg; ++ struct iovec iov[3]; ++ ++ if (!se) { ++ return -EINVAL; ++ } + +- if (!se) +- return -EINVAL; +- +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) +- return -ENOSYS; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) { ++ return -ENOSYS; ++ } + +- outarg.parent = parent; +- outarg.namelen = namelen; +- outarg.padding = 0; ++ outarg.parent = parent; ++ outarg.namelen = namelen; ++ outarg.padding = 0; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); +- iov[2].iov_base = (void *)name; +- iov[2].iov_len = namelen + 1; ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ iov[2].iov_base = (void *)name; ++ iov[2].iov_len = namelen + 1; + +- return send_notify_iov(se, FUSE_NOTIFY_INVAL_ENTRY, iov, 3); ++ return send_notify_iov(se, FUSE_NOTIFY_INVAL_ENTRY, iov, 3); + } + +-int fuse_lowlevel_notify_delete(struct fuse_session *se, +- fuse_ino_t parent, fuse_ino_t child, +- const char *name, size_t namelen) ++int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, ++ fuse_ino_t child, const char *name, ++ size_t namelen) + { +- struct fuse_notify_delete_out outarg; +- struct iovec iov[3]; ++ struct fuse_notify_delete_out outarg; ++ struct iovec iov[3]; + +- if (!se) +- return -EINVAL; ++ if (!se) { ++ return -EINVAL; ++ } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 18) +- return -ENOSYS; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 18) { ++ return -ENOSYS; ++ } + +- outarg.parent = parent; +- outarg.child = child; +- outarg.namelen = namelen; +- outarg.padding = 0; ++ outarg.parent = parent; ++ outarg.child = child; ++ outarg.namelen = namelen; ++ outarg.padding = 0; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); +- iov[2].iov_base = (void *)name; +- iov[2].iov_len = namelen + 1; ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ iov[2].iov_base = 
(void *)name; ++ iov[2].iov_len = namelen + 1; + +- return send_notify_iov(se, FUSE_NOTIFY_DELETE, iov, 3); ++ return send_notify_iov(se, FUSE_NOTIFY_DELETE, iov, 3); + } + + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags) ++ off_t offset, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags) + { +- struct fuse_out_header out; +- struct fuse_notify_store_out outarg; +- struct iovec iov[3]; +- size_t size = fuse_buf_size(bufv); +- int res; ++ struct fuse_out_header out; ++ struct fuse_notify_store_out outarg; ++ struct iovec iov[3]; ++ size_t size = fuse_buf_size(bufv); ++ int res; + +- if (!se) +- return -EINVAL; ++ if (!se) { ++ return -EINVAL; ++ } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) +- return -ENOSYS; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) { ++ return -ENOSYS; ++ } + +- out.unique = 0; +- out.error = FUSE_NOTIFY_STORE; ++ out.unique = 0; ++ out.error = FUSE_NOTIFY_STORE; + +- outarg.nodeid = ino; +- outarg.offset = offset; +- outarg.size = size; +- outarg.padding = 0; ++ outarg.nodeid = ino; ++ outarg.offset = offset; ++ outarg.size = size; ++ outarg.padding = 0; + +- iov[0].iov_base = &out; +- iov[0].iov_len = sizeof(out); +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(out); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); + +- res = fuse_send_data_iov(se, NULL, iov, 2, bufv, flags); +- if (res > 0) +- res = -res; ++ res = fuse_send_data_iov(se, NULL, iov, 2, bufv, flags); ++ if (res > 0) { ++ res = -res; ++ } + +- return res; ++ return res; + } + + struct fuse_retrieve_req { +- struct fuse_notify_req nreq; +- void *cookie; ++ struct fuse_notify_req nreq; ++ void *cookie; + }; + +-static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, +- fuse_req_t req, fuse_ino_t ino, +- const void *inarg, +- const struct 
fuse_buf *ibuf) +-{ +- struct fuse_session *se = req->se; +- struct fuse_retrieve_req *rreq = +- container_of(nreq, struct fuse_retrieve_req, nreq); +- const struct fuse_notify_retrieve_in *arg = inarg; +- struct fuse_bufvec bufv = { +- .buf[0] = *ibuf, +- .count = 1, +- }; +- +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) +- bufv.buf[0].mem = PARAM(arg); +- +- bufv.buf[0].size -= sizeof(struct fuse_in_header) + +- sizeof(struct fuse_notify_retrieve_in); +- +- if (bufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, "fuse: retrieve reply: buffer size too small\n"); +- fuse_reply_none(req); +- goto out; +- } +- bufv.buf[0].size = arg->size; +- +- if (se->op.retrieve_reply) { +- se->op.retrieve_reply(req, rreq->cookie, ino, +- arg->offset, &bufv); +- } else { +- fuse_reply_none(req); +- } ++static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, fuse_req_t req, ++ fuse_ino_t ino, const void *inarg, ++ const struct fuse_buf *ibuf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_retrieve_req *rreq = ++ container_of(nreq, struct fuse_retrieve_req, nreq); ++ const struct fuse_notify_retrieve_in *arg = inarg; ++ struct fuse_bufvec bufv = { ++ .buf[0] = *ibuf, ++ .count = 1, ++ }; ++ ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { ++ bufv.buf[0].mem = PARAM(arg); ++ } ++ ++ bufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + sizeof(struct fuse_notify_retrieve_in); ++ ++ if (bufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: retrieve reply: buffer size too small\n"); ++ fuse_reply_none(req); ++ goto out; ++ } ++ bufv.buf[0].size = arg->size; ++ ++ if (se->op.retrieve_reply) { ++ se->op.retrieve_reply(req, rreq->cookie, ino, arg->offset, &bufv); ++ } else { ++ fuse_reply_none(req); ++ } + out: +- free(rreq); ++ free(rreq); + } + + int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +- size_t size, off_t offset, void *cookie) ++ size_t size, off_t offset, void *cookie) + { +- struct fuse_notify_retrieve_out outarg; 
+- struct iovec iov[2]; +- struct fuse_retrieve_req *rreq; +- int err; ++ struct fuse_notify_retrieve_out outarg; ++ struct iovec iov[2]; ++ struct fuse_retrieve_req *rreq; ++ int err; + +- if (!se) +- return -EINVAL; ++ if (!se) { ++ return -EINVAL; ++ } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) +- return -ENOSYS; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) { ++ return -ENOSYS; ++ } + +- rreq = malloc(sizeof(*rreq)); +- if (rreq == NULL) +- return -ENOMEM; ++ rreq = malloc(sizeof(*rreq)); ++ if (rreq == NULL) { ++ return -ENOMEM; ++ } + +- pthread_mutex_lock(&se->lock); +- rreq->cookie = cookie; +- rreq->nreq.unique = se->notify_ctr++; +- rreq->nreq.reply = fuse_ll_retrieve_reply; +- list_add_nreq(&rreq->nreq, &se->notify_list); +- pthread_mutex_unlock(&se->lock); ++ pthread_mutex_lock(&se->lock); ++ rreq->cookie = cookie; ++ rreq->nreq.unique = se->notify_ctr++; ++ rreq->nreq.reply = fuse_ll_retrieve_reply; ++ list_add_nreq(&rreq->nreq, &se->notify_list); ++ pthread_mutex_unlock(&se->lock); + +- outarg.notify_unique = rreq->nreq.unique; +- outarg.nodeid = ino; +- outarg.offset = offset; +- outarg.size = size; +- outarg.padding = 0; ++ outarg.notify_unique = rreq->nreq.unique; ++ outarg.nodeid = ino; ++ outarg.offset = offset; ++ outarg.size = size; ++ outarg.padding = 0; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); + +- err = send_notify_iov(se, FUSE_NOTIFY_RETRIEVE, iov, 2); +- if (err) { +- pthread_mutex_lock(&se->lock); +- list_del_nreq(&rreq->nreq); +- pthread_mutex_unlock(&se->lock); +- free(rreq); +- } ++ err = send_notify_iov(se, FUSE_NOTIFY_RETRIEVE, iov, 2); ++ if (err) { ++ pthread_mutex_lock(&se->lock); ++ list_del_nreq(&rreq->nreq); ++ pthread_mutex_unlock(&se->lock); ++ free(rreq); ++ } + +- return err; ++ return err; + } + + void *fuse_req_userdata(fuse_req_t req) + { +- return req->se->userdata; ++ return 
req->se->userdata; + } + + const struct fuse_ctx *fuse_req_ctx(fuse_req_t req) + { +- return &req->ctx; ++ return &req->ctx; + } + + void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, +- void *data) ++ void *data) + { +- pthread_mutex_lock(&req->lock); +- pthread_mutex_lock(&req->se->lock); +- req->u.ni.func = func; +- req->u.ni.data = data; +- pthread_mutex_unlock(&req->se->lock); +- if (req->interrupted && func) +- func(req, data); +- pthread_mutex_unlock(&req->lock); ++ pthread_mutex_lock(&req->lock); ++ pthread_mutex_lock(&req->se->lock); ++ req->u.ni.func = func; ++ req->u.ni.data = data; ++ pthread_mutex_unlock(&req->se->lock); ++ if (req->interrupted && func) { ++ func(req, data); ++ } ++ pthread_mutex_unlock(&req->lock); + } + + int fuse_req_interrupted(fuse_req_t req) + { +- int interrupted; ++ int interrupted; + +- pthread_mutex_lock(&req->se->lock); +- interrupted = req->interrupted; +- pthread_mutex_unlock(&req->se->lock); ++ pthread_mutex_lock(&req->se->lock); ++ interrupted = req->interrupted; ++ pthread_mutex_unlock(&req->se->lock); + +- return interrupted; ++ return interrupted; + } + + static struct { +- void (*func)(fuse_req_t, fuse_ino_t, const void *); +- const char *name; ++ void (*func)(fuse_req_t, fuse_ino_t, const void *); ++ const char *name; + } fuse_ll_ops[] = { +- [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, +- [FUSE_FORGET] = { do_forget, "FORGET" }, +- [FUSE_GETATTR] = { do_getattr, "GETATTR" }, +- [FUSE_SETATTR] = { do_setattr, "SETATTR" }, +- [FUSE_READLINK] = { do_readlink, "READLINK" }, +- [FUSE_SYMLINK] = { do_symlink, "SYMLINK" }, +- [FUSE_MKNOD] = { do_mknod, "MKNOD" }, +- [FUSE_MKDIR] = { do_mkdir, "MKDIR" }, +- [FUSE_UNLINK] = { do_unlink, "UNLINK" }, +- [FUSE_RMDIR] = { do_rmdir, "RMDIR" }, +- [FUSE_RENAME] = { do_rename, "RENAME" }, +- [FUSE_LINK] = { do_link, "LINK" }, +- [FUSE_OPEN] = { do_open, "OPEN" }, +- [FUSE_READ] = { do_read, "READ" }, +- [FUSE_WRITE] = { do_write, "WRITE" }, +- [FUSE_STATFS] = { 
do_statfs, "STATFS" }, +- [FUSE_RELEASE] = { do_release, "RELEASE" }, +- [FUSE_FSYNC] = { do_fsync, "FSYNC" }, +- [FUSE_SETXATTR] = { do_setxattr, "SETXATTR" }, +- [FUSE_GETXATTR] = { do_getxattr, "GETXATTR" }, +- [FUSE_LISTXATTR] = { do_listxattr, "LISTXATTR" }, +- [FUSE_REMOVEXATTR] = { do_removexattr, "REMOVEXATTR" }, +- [FUSE_FLUSH] = { do_flush, "FLUSH" }, +- [FUSE_INIT] = { do_init, "INIT" }, +- [FUSE_OPENDIR] = { do_opendir, "OPENDIR" }, +- [FUSE_READDIR] = { do_readdir, "READDIR" }, +- [FUSE_RELEASEDIR] = { do_releasedir, "RELEASEDIR" }, +- [FUSE_FSYNCDIR] = { do_fsyncdir, "FSYNCDIR" }, +- [FUSE_GETLK] = { do_getlk, "GETLK" }, +- [FUSE_SETLK] = { do_setlk, "SETLK" }, +- [FUSE_SETLKW] = { do_setlkw, "SETLKW" }, +- [FUSE_ACCESS] = { do_access, "ACCESS" }, +- [FUSE_CREATE] = { do_create, "CREATE" }, +- [FUSE_INTERRUPT] = { do_interrupt, "INTERRUPT" }, +- [FUSE_BMAP] = { do_bmap, "BMAP" }, +- [FUSE_IOCTL] = { do_ioctl, "IOCTL" }, +- [FUSE_POLL] = { do_poll, "POLL" }, +- [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, +- [FUSE_DESTROY] = { do_destroy, "DESTROY" }, +- [FUSE_NOTIFY_REPLY] = { (void *) 1, "NOTIFY_REPLY" }, +- [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, +- [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS"}, +- [FUSE_RENAME2] = { do_rename2, "RENAME2" }, +- [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, +- [FUSE_LSEEK] = { do_lseek, "LSEEK" }, ++ [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, ++ [FUSE_FORGET] = { do_forget, "FORGET" }, ++ [FUSE_GETATTR] = { do_getattr, "GETATTR" }, ++ [FUSE_SETATTR] = { do_setattr, "SETATTR" }, ++ [FUSE_READLINK] = { do_readlink, "READLINK" }, ++ [FUSE_SYMLINK] = { do_symlink, "SYMLINK" }, ++ [FUSE_MKNOD] = { do_mknod, "MKNOD" }, ++ [FUSE_MKDIR] = { do_mkdir, "MKDIR" }, ++ [FUSE_UNLINK] = { do_unlink, "UNLINK" }, ++ [FUSE_RMDIR] = { do_rmdir, "RMDIR" }, ++ [FUSE_RENAME] = { do_rename, "RENAME" }, ++ [FUSE_LINK] = { do_link, "LINK" }, ++ [FUSE_OPEN] = { do_open, "OPEN" }, ++ 
[FUSE_READ] = { do_read, "READ" }, ++ [FUSE_WRITE] = { do_write, "WRITE" }, ++ [FUSE_STATFS] = { do_statfs, "STATFS" }, ++ [FUSE_RELEASE] = { do_release, "RELEASE" }, ++ [FUSE_FSYNC] = { do_fsync, "FSYNC" }, ++ [FUSE_SETXATTR] = { do_setxattr, "SETXATTR" }, ++ [FUSE_GETXATTR] = { do_getxattr, "GETXATTR" }, ++ [FUSE_LISTXATTR] = { do_listxattr, "LISTXATTR" }, ++ [FUSE_REMOVEXATTR] = { do_removexattr, "REMOVEXATTR" }, ++ [FUSE_FLUSH] = { do_flush, "FLUSH" }, ++ [FUSE_INIT] = { do_init, "INIT" }, ++ [FUSE_OPENDIR] = { do_opendir, "OPENDIR" }, ++ [FUSE_READDIR] = { do_readdir, "READDIR" }, ++ [FUSE_RELEASEDIR] = { do_releasedir, "RELEASEDIR" }, ++ [FUSE_FSYNCDIR] = { do_fsyncdir, "FSYNCDIR" }, ++ [FUSE_GETLK] = { do_getlk, "GETLK" }, ++ [FUSE_SETLK] = { do_setlk, "SETLK" }, ++ [FUSE_SETLKW] = { do_setlkw, "SETLKW" }, ++ [FUSE_ACCESS] = { do_access, "ACCESS" }, ++ [FUSE_CREATE] = { do_create, "CREATE" }, ++ [FUSE_INTERRUPT] = { do_interrupt, "INTERRUPT" }, ++ [FUSE_BMAP] = { do_bmap, "BMAP" }, ++ [FUSE_IOCTL] = { do_ioctl, "IOCTL" }, ++ [FUSE_POLL] = { do_poll, "POLL" }, ++ [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, ++ [FUSE_DESTROY] = { do_destroy, "DESTROY" }, ++ [FUSE_NOTIFY_REPLY] = { (void *)1, "NOTIFY_REPLY" }, ++ [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, ++ [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS" }, ++ [FUSE_RENAME2] = { do_rename2, "RENAME2" }, ++ [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, ++ [FUSE_LSEEK] = { do_lseek, "LSEEK" }, + }; + + #define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0])) + + static const char *opname(enum fuse_opcode opcode) + { +- if (opcode >= FUSE_MAXOP || !fuse_ll_ops[opcode].name) +- return "???"; +- else +- return fuse_ll_ops[opcode].name; ++ if (opcode >= FUSE_MAXOP || !fuse_ll_ops[opcode].name) { ++ return "???"; ++ } else { ++ return fuse_ll_ops[opcode].name; ++ } + } + + void fuse_session_process_buf(struct fuse_session *se, +- const struct fuse_buf *buf) ++ 
const struct fuse_buf *buf) + { +- fuse_session_process_buf_int(se, buf, NULL); ++ fuse_session_process_buf_int(se, buf, NULL); + } + + void fuse_session_process_buf_int(struct fuse_session *se, +- const struct fuse_buf *buf, struct fuse_chan *ch) +-{ +- struct fuse_in_header *in; +- const void *inarg; +- struct fuse_req *req; +- int err; +- +- in = buf->mem; +- +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, +- "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n", +- (unsigned long long) in->unique, +- opname((enum fuse_opcode) in->opcode), in->opcode, +- (unsigned long long) in->nodeid, buf->size, in->pid); +- } +- +- req = fuse_ll_alloc_req(se); +- if (req == NULL) { +- struct fuse_out_header out = { +- .unique = in->unique, +- .error = -ENOMEM, +- }; +- struct iovec iov = { +- .iov_base = &out, +- .iov_len = sizeof(struct fuse_out_header), +- }; +- +- fuse_send_msg(se, ch, &iov, 1); +- return; +- } +- +- req->unique = in->unique; +- req->ctx.uid = in->uid; +- req->ctx.gid = in->gid; +- req->ctx.pid = in->pid; +- req->ch = ch; +- +- err = EIO; +- if (!se->got_init) { +- enum fuse_opcode expected; +- +- expected = se->cuse_data ? 
CUSE_INIT : FUSE_INIT; +- if (in->opcode != expected) +- goto reply_err; +- } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) +- goto reply_err; +- +- err = EACCES; +- /* Implement -o allow_root */ +- if (se->deny_others && in->uid != se->owner && in->uid != 0 && +- in->opcode != FUSE_INIT && in->opcode != FUSE_READ && +- in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC && +- in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR && +- in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR && +- in->opcode != FUSE_NOTIFY_REPLY && +- in->opcode != FUSE_READDIRPLUS) +- goto reply_err; +- +- err = ENOSYS; +- if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) +- goto reply_err; +- if (in->opcode != FUSE_INTERRUPT) { +- struct fuse_req *intr; +- pthread_mutex_lock(&se->lock); +- intr = check_interrupt(se, req); +- list_add_req(req, &se->list); +- pthread_mutex_unlock(&se->lock); +- if (intr) +- fuse_reply_err(intr, EAGAIN); +- } +- +- inarg = (void *) &in[1]; +- if (in->opcode == FUSE_WRITE && se->op.write_buf) +- do_write_buf(req, in->nodeid, inarg, buf); +- else if (in->opcode == FUSE_NOTIFY_REPLY) +- do_notify_reply(req, in->nodeid, inarg, buf); +- else +- fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); +- +- return; ++ const struct fuse_buf *buf, ++ struct fuse_chan *ch) ++{ ++ struct fuse_in_header *in; ++ const void *inarg; ++ struct fuse_req *req; ++ int err; ++ ++ in = buf->mem; ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, " ++ "pid: %u\n", ++ (unsigned long long)in->unique, ++ opname((enum fuse_opcode)in->opcode), in->opcode, ++ (unsigned long long)in->nodeid, buf->size, in->pid); ++ } ++ ++ req = fuse_ll_alloc_req(se); ++ if (req == NULL) { ++ struct fuse_out_header out = { ++ .unique = in->unique, ++ .error = -ENOMEM, ++ }; ++ struct iovec iov = { ++ .iov_base = &out, ++ .iov_len = sizeof(struct fuse_out_header), ++ }; ++ ++ fuse_send_msg(se, ch, 
&iov, 1); ++ return; ++ } ++ ++ req->unique = in->unique; ++ req->ctx.uid = in->uid; ++ req->ctx.gid = in->gid; ++ req->ctx.pid = in->pid; ++ req->ch = ch; ++ ++ err = EIO; ++ if (!se->got_init) { ++ enum fuse_opcode expected; ++ ++ expected = se->cuse_data ? CUSE_INIT : FUSE_INIT; ++ if (in->opcode != expected) { ++ goto reply_err; ++ } ++ } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) { ++ goto reply_err; ++ } ++ ++ err = EACCES; ++ /* Implement -o allow_root */ ++ if (se->deny_others && in->uid != se->owner && in->uid != 0 && ++ in->opcode != FUSE_INIT && in->opcode != FUSE_READ && ++ in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC && ++ in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR && ++ in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR && ++ in->opcode != FUSE_NOTIFY_REPLY && in->opcode != FUSE_READDIRPLUS) { ++ goto reply_err; ++ } ++ ++ err = ENOSYS; ++ if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) { ++ goto reply_err; ++ } ++ if (in->opcode != FUSE_INTERRUPT) { ++ struct fuse_req *intr; ++ pthread_mutex_lock(&se->lock); ++ intr = check_interrupt(se, req); ++ list_add_req(req, &se->list); ++ pthread_mutex_unlock(&se->lock); ++ if (intr) { ++ fuse_reply_err(intr, EAGAIN); ++ } ++ } ++ ++ inarg = (void *)&in[1]; ++ if (in->opcode == FUSE_WRITE && se->op.write_buf) { ++ do_write_buf(req, in->nodeid, inarg, buf); ++ } else if (in->opcode == FUSE_NOTIFY_REPLY) { ++ do_notify_reply(req, in->nodeid, inarg, buf); ++ } else { ++ fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); ++ } ++ ++ return; + + reply_err: +- fuse_reply_err(req, err); ++ fuse_reply_err(req, err); + } + +-#define LL_OPTION(n,o,v) \ +- { n, offsetof(struct fuse_session, o), v } ++#define LL_OPTION(n, o, v) \ ++ { \ ++ n, offsetof(struct fuse_session, o), v \ ++ } + + static const struct fuse_opt fuse_ll_opts[] = { +- LL_OPTION("debug", debug, 1), +- LL_OPTION("-d", debug, 1), +- LL_OPTION("--debug", debug, 1), +- 
LL_OPTION("allow_root", deny_others, 1), +- FUSE_OPT_END ++ LL_OPTION("debug", debug, 1), LL_OPTION("-d", debug, 1), ++ LL_OPTION("--debug", debug, 1), LL_OPTION("allow_root", deny_others, 1), ++ FUSE_OPT_END + }; + + void fuse_lowlevel_version(void) + { +- printf("using FUSE kernel interface version %i.%i\n", +- FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); ++ printf("using FUSE kernel interface version %i.%i\n", FUSE_KERNEL_VERSION, ++ FUSE_KERNEL_MINOR_VERSION); + } + + void fuse_lowlevel_help(void) + { +- /* These are not all options, but the ones that are +- potentially of interest to an end-user */ +- printf( +-" -o allow_root allow access by root\n" +-); ++ /* ++ * These are not all options, but the ones that are ++ * potentially of interest to an end-user ++ */ ++ printf(" -o allow_root allow access by root\n"); + } + + void fuse_session_destroy(struct fuse_session *se) + { +- if (se->got_init && !se->got_destroy) { +- if (se->op.destroy) +- se->op.destroy(se->userdata); +- } +- pthread_mutex_destroy(&se->lock); +- free(se->cuse_data); +- if (se->fd != -1) +- close(se->fd); +- free(se); ++ if (se->got_init && !se->got_destroy) { ++ if (se->op.destroy) { ++ se->op.destroy(se->userdata); ++ } ++ } ++ pthread_mutex_destroy(&se->lock); ++ free(se->cuse_data); ++ if (se->fd != -1) { ++ close(se->fd); ++ } ++ free(se); + } + + + struct fuse_session *fuse_session_new(struct fuse_args *args, +- const struct fuse_lowlevel_ops *op, +- size_t op_size, void *userdata) +-{ +- struct fuse_session *se; +- +- if (sizeof(struct fuse_lowlevel_ops) < op_size) { +- fuse_log(FUSE_LOG_ERR, "fuse: warning: library too old, some operations may not work\n"); +- op_size = sizeof(struct fuse_lowlevel_ops); +- } +- +- if (args->argc == 0) { +- fuse_log(FUSE_LOG_ERR, "fuse: empty argv passed to fuse_session_new().\n"); +- return NULL; +- } +- +- se = (struct fuse_session *) calloc(1, sizeof(struct fuse_session)); +- if (se == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: failed to 
allocate fuse object\n"); +- goto out1; +- } +- se->fd = -1; +- se->conn.max_write = UINT_MAX; +- se->conn.max_readahead = UINT_MAX; +- +- /* Parse options */ +- if(fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) +- goto out2; +- if(args->argc == 1 && +- args->argv[0][0] == '-') { +- fuse_log(FUSE_LOG_ERR, "fuse: warning: argv[0] looks like an option, but " +- "will be ignored\n"); +- } else if (args->argc != 1) { +- int i; +- fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `"); +- for(i = 1; i < args->argc-1; i++) +- fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]); +- fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]); +- goto out4; +- } +- +- se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + +- FUSE_BUFFER_HEADER_SIZE; +- +- list_init_req(&se->list); +- list_init_req(&se->interrupts); +- list_init_nreq(&se->notify_list); +- se->notify_ctr = 1; +- fuse_mutex_init(&se->lock); +- +- memcpy(&se->op, op, op_size); +- se->owner = getuid(); +- se->userdata = userdata; +- +- return se; ++ const struct fuse_lowlevel_ops *op, ++ size_t op_size, void *userdata) ++{ ++ struct fuse_session *se; ++ ++ if (sizeof(struct fuse_lowlevel_ops) < op_size) { ++ fuse_log( ++ FUSE_LOG_ERR, ++ "fuse: warning: library too old, some operations may not work\n"); ++ op_size = sizeof(struct fuse_lowlevel_ops); ++ } ++ ++ if (args->argc == 0) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: empty argv passed to fuse_session_new().\n"); ++ return NULL; ++ } ++ ++ se = (struct fuse_session *)calloc(1, sizeof(struct fuse_session)); ++ if (se == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate fuse object\n"); ++ goto out1; ++ } ++ se->fd = -1; ++ se->conn.max_write = UINT_MAX; ++ se->conn.max_readahead = UINT_MAX; ++ ++ /* Parse options */ ++ if (fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) { ++ goto out2; ++ } ++ if (args->argc == 1 && args->argv[0][0] == '-') { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: warning: argv[0] looks like an option, but " ++ "will be ignored\n"); ++ } else if 
(args->argc != 1) { ++ int i; ++ fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `"); ++ for (i = 1; i < args->argc - 1; i++) { ++ fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]); ++ } ++ fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]); ++ goto out4; ++ } ++ ++ se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + FUSE_BUFFER_HEADER_SIZE; ++ ++ list_init_req(&se->list); ++ list_init_req(&se->interrupts); ++ list_init_nreq(&se->notify_list); ++ se->notify_ctr = 1; ++ fuse_mutex_init(&se->lock); ++ ++ memcpy(&se->op, op, op_size); ++ se->owner = getuid(); ++ se->userdata = userdata; ++ ++ return se; + + out4: +- fuse_opt_free_args(args); ++ fuse_opt_free_args(args); + out2: +- free(se); ++ free(se); + out1: +- return NULL; ++ return NULL; + } + + int fuse_session_mount(struct fuse_session *se, const char *mountpoint) + { +- int fd; +- +- /* +- * Make sure file descriptors 0, 1 and 2 are open, otherwise chaos +- * would ensue. +- */ +- do { +- fd = open("/dev/null", O_RDWR); +- if (fd > 2) +- close(fd); +- } while (fd >= 0 && fd <= 2); +- +- /* +- * To allow FUSE daemons to run without privileges, the caller may open +- * /dev/fuse before launching the file system and pass on the file +- * descriptor by specifying /dev/fd/N as the mount point. Note that the +- * parent process takes care of performing the mount in this case. +- */ +- fd = fuse_mnt_parse_fuse_fd(mountpoint); +- if (fd != -1) { +- if (fcntl(fd, F_GETFD) == -1) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: Invalid file descriptor /dev/fd/%u\n", +- fd); +- return -1; +- } +- se->fd = fd; +- return 0; +- } +- +- /* Open channel */ +- fd = fuse_kern_mount(mountpoint, se->mo); +- if (fd == -1) +- return -1; +- se->fd = fd; +- +- /* Save mountpoint */ +- se->mountpoint = strdup(mountpoint); +- if (se->mountpoint == NULL) +- goto error_out; +- +- return 0; ++ int fd; ++ ++ /* ++ * Make sure file descriptors 0, 1 and 2 are open, otherwise chaos ++ * would ensue. 
++ */ ++ do { ++ fd = open("/dev/null", O_RDWR); ++ if (fd > 2) { ++ close(fd); ++ } ++ } while (fd >= 0 && fd <= 2); ++ ++ /* ++ * To allow FUSE daemons to run without privileges, the caller may open ++ * /dev/fuse before launching the file system and pass on the file ++ * descriptor by specifying /dev/fd/N as the mount point. Note that the ++ * parent process takes care of performing the mount in this case. ++ */ ++ fd = fuse_mnt_parse_fuse_fd(mountpoint); ++ if (fd != -1) { ++ if (fcntl(fd, F_GETFD) == -1) { ++ fuse_log(FUSE_LOG_ERR, "fuse: Invalid file descriptor /dev/fd/%u\n", ++ fd); ++ return -1; ++ } ++ se->fd = fd; ++ return 0; ++ } ++ ++ /* Open channel */ ++ fd = fuse_kern_mount(mountpoint, se->mo); ++ if (fd == -1) { ++ return -1; ++ } ++ se->fd = fd; ++ ++ /* Save mountpoint */ ++ se->mountpoint = strdup(mountpoint); ++ if (se->mountpoint == NULL) { ++ goto error_out; ++ } ++ ++ return 0; + + error_out: +- fuse_kern_unmount(mountpoint, fd); +- return -1; ++ fuse_kern_unmount(mountpoint, fd); ++ return -1; + } + + int fuse_session_fd(struct fuse_session *se) + { +- return se->fd; ++ return se->fd; + } + + void fuse_session_unmount(struct fuse_session *se) +@@ -2384,61 +2519,66 @@ void fuse_session_unmount(struct fuse_session *se) + #ifdef linux + int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) + { +- char *buf; +- size_t bufsize = 1024; +- char path[128]; +- int ret; +- int fd; +- unsigned long pid = req->ctx.pid; +- char *s; ++ char *buf; ++ size_t bufsize = 1024; ++ char path[128]; ++ int ret; ++ int fd; ++ unsigned long pid = req->ctx.pid; ++ char *s; + +- sprintf(path, "/proc/%lu/task/%lu/status", pid, pid); ++ sprintf(path, "/proc/%lu/task/%lu/status", pid, pid); + + retry: +- buf = malloc(bufsize); +- if (buf == NULL) +- return -ENOMEM; +- +- ret = -EIO; +- fd = open(path, O_RDONLY); +- if (fd == -1) +- goto out_free; +- +- ret = read(fd, buf, bufsize); +- close(fd); +- if (ret < 0) { +- ret = -EIO; +- goto out_free; +- } +- +- if 
((size_t)ret == bufsize) { +- free(buf); +- bufsize *= 4; +- goto retry; +- } +- +- ret = -EIO; +- s = strstr(buf, "\nGroups:"); +- if (s == NULL) +- goto out_free; +- +- s += 8; +- ret = 0; +- while (1) { +- char *end; +- unsigned long val = strtoul(s, &end, 0); +- if (end == s) +- break; +- +- s = end; +- if (ret < size) +- list[ret] = val; +- ret++; +- } ++ buf = malloc(bufsize); ++ if (buf == NULL) { ++ return -ENOMEM; ++ } ++ ++ ret = -EIO; ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ goto out_free; ++ } ++ ++ ret = read(fd, buf, bufsize); ++ close(fd); ++ if (ret < 0) { ++ ret = -EIO; ++ goto out_free; ++ } ++ ++ if ((size_t)ret == bufsize) { ++ free(buf); ++ bufsize *= 4; ++ goto retry; ++ } ++ ++ ret = -EIO; ++ s = strstr(buf, "\nGroups:"); ++ if (s == NULL) { ++ goto out_free; ++ } ++ ++ s += 8; ++ ret = 0; ++ while (1) { ++ char *end; ++ unsigned long val = strtoul(s, &end, 0); ++ if (end == s) { ++ break; ++ } ++ ++ s = end; ++ if (ret < size) { ++ list[ret] = val; ++ } ++ ret++; ++ } + + out_free: +- free(buf); +- return ret; ++ free(buf); ++ return ret; + } + #else /* linux */ + /* +@@ -2446,23 +2586,25 @@ out_free: + */ + int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) + { +- (void) req; (void) size; (void) list; +- return -ENOSYS; ++ (void)req; ++ (void)size; ++ (void)list; ++ return -ENOSYS; + } + #endif + + void fuse_session_exit(struct fuse_session *se) + { +- se->exited = 1; ++ se->exited = 1; + } + + void fuse_session_reset(struct fuse_session *se) + { +- se->exited = 0; +- se->error = 0; ++ se->exited = 0; ++ se->error = 0; + } + + int fuse_session_exited(struct fuse_session *se) + { +- return se->exited; ++ return se->exited; + } +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 6b1adfc..adb9054 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1,15 +1,16 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- 
This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. ++ */ + + #ifndef FUSE_LOWLEVEL_H_ + #define FUSE_LOWLEVEL_H_ + +-/** @file ++/** ++ * @file + * + * Low level API + * +@@ -24,16 +25,16 @@ + + #include "fuse_common.h" + +-#include + #include +-#include + #include + #include ++#include + #include ++#include + +-/* ----------------------------------------------------------- * +- * Miscellaneous definitions * +- * ----------------------------------------------------------- */ ++/* ++ * Miscellaneous definitions ++ */ + + /** The node ID of the root inode */ + #define FUSE_ROOT_ID 1 +@@ -53,47 +54,54 @@ struct fuse_session; + + /** Directory entry parameters supplied to fuse_reply_entry() */ + struct fuse_entry_param { +- /** Unique inode number +- * +- * In lookup, zero means negative entry (from version 2.5) +- * Returning ENOENT also means negative entry, but by setting zero +- * ino the kernel may cache negative entries for entry_timeout +- * seconds. +- */ +- fuse_ino_t ino; +- +- /** Generation number for this entry. +- * +- * If the file system will be exported over NFS, the +- * ino/generation pairs need to be unique over the file +- * system's lifetime (rather than just the mount time). So if +- * the file system reuses an inode after it has been deleted, +- * it must assign a new, previously unused generation number +- * to the inode at the same time. +- * +- */ +- uint64_t generation; +- +- /** Inode attributes. +- * +- * Even if attr_timeout == 0, attr must be correct. For example, +- * for open(), FUSE uses attr.st_size from lookup() to determine +- * how many bytes to request. If this value is not correct, +- * incorrect data will be returned. 
+- */ +- struct stat attr; +- +- /** Validity timeout (in seconds) for inode attributes. If +- attributes only change as a result of requests that come +- through the kernel, this should be set to a very large +- value. */ +- double attr_timeout; +- +- /** Validity timeout (in seconds) for the name. If directory +- entries are changed/deleted only as a result of requests +- that come through the kernel, this should be set to a very +- large value. */ +- double entry_timeout; ++ /** ++ * Unique inode number ++ * ++ * In lookup, zero means negative entry (from version 2.5) ++ * Returning ENOENT also means negative entry, but by setting zero ++ * ino the kernel may cache negative entries for entry_timeout ++ * seconds. ++ */ ++ fuse_ino_t ino; ++ ++ /** ++ * Generation number for this entry. ++ * ++ * If the file system will be exported over NFS, the ++ * ino/generation pairs need to be unique over the file ++ * system's lifetime (rather than just the mount time). So if ++ * the file system reuses an inode after it has been deleted, ++ * it must assign a new, previously unused generation number ++ * to the inode at the same time. ++ * ++ */ ++ uint64_t generation; ++ ++ /** ++ * Inode attributes. ++ * ++ * Even if attr_timeout == 0, attr must be correct. For example, ++ * for open(), FUSE uses attr.st_size from lookup() to determine ++ * how many bytes to request. If this value is not correct, ++ * incorrect data will be returned. ++ */ ++ struct stat attr; ++ ++ /** ++ * Validity timeout (in seconds) for inode attributes. If ++ * attributes only change as a result of requests that come ++ * through the kernel, this should be set to a very large ++ * value. ++ */ ++ double attr_timeout; ++ ++ /** ++ * Validity timeout (in seconds) for the name. If directory ++ * entries are changed/deleted only as a result of requests ++ * that come through the kernel, this should be set to a very ++ * large value. 
++ */ ++ double entry_timeout; + }; + + /** +@@ -105,38 +113,38 @@ struct fuse_entry_param { + * there is no valid uid/pid/gid that could be reported. + */ + struct fuse_ctx { +- /** User ID of the calling process */ +- uid_t uid; ++ /** User ID of the calling process */ ++ uid_t uid; + +- /** Group ID of the calling process */ +- gid_t gid; ++ /** Group ID of the calling process */ ++ gid_t gid; + +- /** Thread ID of the calling process */ +- pid_t pid; ++ /** Thread ID of the calling process */ ++ pid_t pid; + +- /** Umask of the calling process */ +- mode_t umask; ++ /** Umask of the calling process */ ++ mode_t umask; + }; + + struct fuse_forget_data { +- fuse_ino_t ino; +- uint64_t nlookup; ++ fuse_ino_t ino; ++ uint64_t nlookup; + }; + + /* 'to_set' flags in setattr */ +-#define FUSE_SET_ATTR_MODE (1 << 0) +-#define FUSE_SET_ATTR_UID (1 << 1) +-#define FUSE_SET_ATTR_GID (1 << 2) +-#define FUSE_SET_ATTR_SIZE (1 << 3) +-#define FUSE_SET_ATTR_ATIME (1 << 4) +-#define FUSE_SET_ATTR_MTIME (1 << 5) +-#define FUSE_SET_ATTR_ATIME_NOW (1 << 7) +-#define FUSE_SET_ATTR_MTIME_NOW (1 << 8) +-#define FUSE_SET_ATTR_CTIME (1 << 10) +- +-/* ----------------------------------------------------------- * +- * Request methods and replies * +- * ----------------------------------------------------------- */ ++#define FUSE_SET_ATTR_MODE (1 << 0) ++#define FUSE_SET_ATTR_UID (1 << 1) ++#define FUSE_SET_ATTR_GID (1 << 2) ++#define FUSE_SET_ATTR_SIZE (1 << 3) ++#define FUSE_SET_ATTR_ATIME (1 << 4) ++#define FUSE_SET_ATTR_MTIME (1 << 5) ++#define FUSE_SET_ATTR_ATIME_NOW (1 << 7) ++#define FUSE_SET_ATTR_MTIME_NOW (1 << 8) ++#define FUSE_SET_ATTR_CTIME (1 << 10) ++ ++/* ++ * Request methods and replies ++ */ + + /** + * Low level filesystem operations +@@ -166,1075 +174,1069 @@ struct fuse_forget_data { + * this file will not be called. 
+ */ + struct fuse_lowlevel_ops { +- /** +- * Initialize filesystem +- * +- * This function is called when libfuse establishes +- * communication with the FUSE kernel module. The file system +- * should use this module to inspect and/or modify the +- * connection parameters provided in the `conn` structure. +- * +- * Note that some parameters may be overwritten by options +- * passed to fuse_session_new() which take precedence over the +- * values set in this handler. +- * +- * There's no reply to this function +- * +- * @param userdata the user data passed to fuse_session_new() +- */ +- void (*init) (void *userdata, struct fuse_conn_info *conn); +- +- /** +- * Clean up filesystem. +- * +- * Called on filesystem exit. When this method is called, the +- * connection to the kernel may be gone already, so that eg. calls +- * to fuse_lowlevel_notify_* will fail. +- * +- * There's no reply to this function +- * +- * @param userdata the user data passed to fuse_session_new() +- */ +- void (*destroy) (void *userdata); +- +- /** +- * Look up a directory entry by name and get its attributes. +- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name the name to look up +- */ +- void (*lookup) (fuse_req_t req, fuse_ino_t parent, const char *name); +- +- /** +- * Forget about an inode +- * +- * This function is called when the kernel removes an inode +- * from its internal caches. +- * +- * The inode's lookup count increases by one for every call to +- * fuse_reply_entry and fuse_reply_create. The nlookup parameter +- * indicates by how much the lookup count should be decreased. +- * +- * Inodes with a non-zero lookup count may receive request from +- * the kernel even after calls to unlink, rmdir or (when +- * overwriting an existing file) rename. 
Filesystems must handle +- * such requests properly and it is recommended to defer removal +- * of the inode until the lookup count reaches zero. Calls to +- * unlink, rmdir or rename will be followed closely by forget +- * unless the file or directory is open, in which case the +- * kernel issues forget only after the release or releasedir +- * calls. +- * +- * Note that if a file system will be exported over NFS the +- * inodes lifetime must extend even beyond forget. See the +- * generation field in struct fuse_entry_param above. +- * +- * On unmount the lookup count for all inodes implicitly drops +- * to zero. It is not guaranteed that the file system will +- * receive corresponding forget messages for the affected +- * inodes. +- * +- * Valid replies: +- * fuse_reply_none +- * +- * @param req request handle +- * @param ino the inode number +- * @param nlookup the number of lookups to forget +- */ +- void (*forget) (fuse_req_t req, fuse_ino_t ino, uint64_t nlookup); +- +- /** +- * Get file attributes. +- * +- * If writeback caching is enabled, the kernel may have a +- * better idea of a file's length than the FUSE file system +- * (eg if there has been a write that extended the file size, +- * but that has not yet been passed to the filesystem.n +- * +- * In this case, the st_size value provided by the file system +- * will be ignored. +- * +- * Valid replies: +- * fuse_reply_attr +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi for future use, currently always NULL +- */ +- void (*getattr) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Set file attributes +- * +- * In the 'attr' argument only members indicated by the 'to_set' +- * bitmask contain valid values. Other members contain undefined +- * values. +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits if the file +- * size or owner is being changed. 
+- * +- * If the setattr was invoked from the ftruncate() system call +- * under Linux kernel versions 2.6.15 or later, the fi->fh will +- * contain the value set by the open method or will be undefined +- * if the open method didn't set any value. Otherwise (not +- * ftruncate call, or kernel version earlier than 2.6.15) the fi +- * parameter will be NULL. +- * +- * Valid replies: +- * fuse_reply_attr +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param attr the attributes +- * @param to_set bit mask of attributes which should be set +- * @param fi file information, or NULL +- */ +- void (*setattr) (fuse_req_t req, fuse_ino_t ino, struct stat *attr, +- int to_set, struct fuse_file_info *fi); +- +- /** +- * Read symbolic link +- * +- * Valid replies: +- * fuse_reply_readlink +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- */ +- void (*readlink) (fuse_req_t req, fuse_ino_t ino); +- +- /** +- * Create file node +- * +- * Create a regular file, character device, block device, fifo or +- * socket node. 
+- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to create +- * @param mode file type and mode with which to create the new file +- * @param rdev the device number (only valid if created file is a device) +- */ +- void (*mknod) (fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode, dev_t rdev); +- +- /** +- * Create a directory +- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to create +- * @param mode with which to create the new file +- */ +- void (*mkdir) (fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode); +- +- /** +- * Remove a file +- * +- * If the file's inode's lookup count is non-zero, the file +- * system is expected to postpone any removal of the inode +- * until the lookup count reaches zero (see description of the +- * forget function). +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to remove +- */ +- void (*unlink) (fuse_req_t req, fuse_ino_t parent, const char *name); +- +- /** +- * Remove a directory +- * +- * If the directory's inode's lookup count is non-zero, the +- * file system is expected to postpone any removal of the +- * inode until the lookup count reaches zero (see description +- * of the forget function). 
+- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to remove +- */ +- void (*rmdir) (fuse_req_t req, fuse_ino_t parent, const char *name); +- +- /** +- * Create a symbolic link +- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param link the contents of the symbolic link +- * @param parent inode number of the parent directory +- * @param name to create +- */ +- void (*symlink) (fuse_req_t req, const char *link, fuse_ino_t parent, +- const char *name); +- +- /** Rename a file +- * +- * If the target exists it should be atomically replaced. If +- * the target's inode's lookup count is non-zero, the file +- * system is expected to postpone any removal of the inode +- * until the lookup count reaches zero (see description of the +- * forget function). +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EINVAL, i.e. all +- * future bmap requests will fail with EINVAL without being +- * send to the filesystem process. +- * +- * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If +- * RENAME_NOREPLACE is specified, the filesystem must not +- * overwrite *newname* if it exists and return an error +- * instead. If `RENAME_EXCHANGE` is specified, the filesystem +- * must atomically exchange the two files, i.e. both must +- * exist and neither may be deleted. 
+- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the old parent directory +- * @param name old name +- * @param newparent inode number of the new parent directory +- * @param newname new name +- */ +- void (*rename) (fuse_req_t req, fuse_ino_t parent, const char *name, +- fuse_ino_t newparent, const char *newname, +- unsigned int flags); +- +- /** +- * Create a hard link +- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the old inode number +- * @param newparent inode number of the new parent directory +- * @param newname new name to create +- */ +- void (*link) (fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, +- const char *newname); +- +- /** +- * Open a file +- * +- * Open flags are available in fi->flags. The following rules +- * apply. +- * +- * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be +- * filtered out / handled by the kernel. +- * +- * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used +- * by the filesystem to check if the operation is +- * permitted. If the ``-o default_permissions`` mount +- * option is given, this check is already done by the +- * kernel before calling open() and may thus be omitted by +- * the filesystem. +- * +- * - When writeback caching is enabled, the kernel may send +- * read requests even for files opened with O_WRONLY. The +- * filesystem should be prepared to handle this. +- * +- * - When writeback caching is disabled, the filesystem is +- * expected to properly handle the O_APPEND flag and ensure +- * that each write is appending to the end of the file. +- * +- * - When writeback caching is enabled, the kernel will +- * handle O_APPEND. However, unless all changes to the file +- * come through the kernel this will not work reliably. 
The +- * filesystem should thus either ignore the O_APPEND flag +- * (and let the kernel handle it), or return an error +- * (indicating that reliably O_APPEND is not available). +- * +- * Filesystem may store an arbitrary file handle (pointer, +- * index, etc) in fi->fh, and use this in other all other file +- * operations (read, write, flush, release, fsync). +- * +- * Filesystem may also implement stateless file I/O and not store +- * anything in fi->fh. +- * +- * There are also some flags (direct_io, keep_cache) which the +- * filesystem may set in fi, to change the way the file is opened. +- * See fuse_file_info structure in for more details. +- * +- * If this request is answered with an error code of ENOSYS +- * and FUSE_CAP_NO_OPEN_SUPPORT is set in +- * `fuse_conn_info.capable`, this is treated as success and +- * future calls to open and release will also succeed without being +- * sent to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_open +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- */ +- void (*open) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Read data +- * +- * Read should send exactly the number of bytes requested except +- * on EOF or error, otherwise the rest of the data will be +- * substituted with zeroes. An exception to this is when the file +- * has been opened in 'direct_io' mode, in which case the return +- * value of the read system call will reflect the return value of +- * this operation. +- * +- * fi->fh will contain the value set by the open method, or will +- * be undefined if the open method didn't set any value. 
+- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_iov +- * fuse_reply_data +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param size number of bytes to read +- * @param off offset to read from +- * @param fi file information +- */ +- void (*read) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, +- struct fuse_file_info *fi); +- +- /** +- * Write data +- * +- * Write should return exactly the number of bytes requested +- * except on error. An exception to this is when the file has +- * been opened in 'direct_io' mode, in which case the return value +- * of the write system call will reflect the return value of this +- * operation. +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- * +- * fi->fh will contain the value set by the open method, or will +- * be undefined if the open method didn't set any value. +- * +- * Valid replies: +- * fuse_reply_write +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param buf data to write +- * @param size number of bytes to write +- * @param off offset to write to +- * @param fi file information +- */ +- void (*write) (fuse_req_t req, fuse_ino_t ino, const char *buf, +- size_t size, off_t off, struct fuse_file_info *fi); +- +- /** +- * Flush method +- * +- * This is called on each close() of the opened file. +- * +- * Since file descriptors can be duplicated (dup, dup2, fork), for +- * one open call there may be many flush calls. +- * +- * Filesystems shouldn't assume that flush will always be called +- * after some writes, or that if will be called at all. +- * +- * fi->fh will contain the value set by the open method, or will +- * be undefined if the open method didn't set any value. +- * +- * NOTE: the name of the method is misleading, since (unlike +- * fsync) the filesystem is not forced to flush pending writes. 
+- * One reason to flush data is if the filesystem wants to return +- * write errors during close. However, such use is non-portable +- * because POSIX does not require [close] to wait for delayed I/O to +- * complete. +- * +- * If the filesystem supports file locking operations (setlk, +- * getlk) it should remove all locks belonging to 'fi->owner'. +- * +- * If this request is answered with an error code of ENOSYS, +- * this is treated as success and future calls to flush() will +- * succeed automatically without being send to the filesystem +- * process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * +- * [close]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html +- */ +- void (*flush) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Release an open file +- * +- * Release is called when there are no more references to an open +- * file: all file descriptors are closed and all memory mappings +- * are unmapped. +- * +- * For every open call there will be exactly one release call (unless +- * the filesystem is force-unmounted). +- * +- * The filesystem may reply with an error, but error values are +- * not returned to close() or munmap() which triggered the +- * release. +- * +- * fi->fh will contain the value set by the open method, or will +- * be undefined if the open method didn't set any value. +- * fi->flags will contain the same flags as for open. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- */ +- void (*release) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Synchronize file contents +- * +- * If the datasync parameter is non-zero, then only the user data +- * should be flushed, not the meta data. 
+- * +- * If this request is answered with an error code of ENOSYS, +- * this is treated as success and future calls to fsync() will +- * succeed automatically without being send to the filesystem +- * process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param datasync flag indicating if only data should be flushed +- * @param fi file information +- */ +- void (*fsync) (fuse_req_t req, fuse_ino_t ino, int datasync, +- struct fuse_file_info *fi); +- +- /** +- * Open a directory +- * +- * Filesystem may store an arbitrary file handle (pointer, index, +- * etc) in fi->fh, and use this in other all other directory +- * stream operations (readdir, releasedir, fsyncdir). +- * +- * If this request is answered with an error code of ENOSYS and +- * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`, +- * this is treated as success and future calls to opendir and +- * releasedir will also succeed without being sent to the filesystem +- * process. In addition, the kernel will cache readdir results +- * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR. +- * +- * Valid replies: +- * fuse_reply_open +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- */ +- void (*opendir) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Read directory +- * +- * Send a buffer filled using fuse_add_direntry(), with size not +- * exceeding the requested size. Send an empty buffer on end of +- * stream. +- * +- * fi->fh will contain the value set by the opendir method, or +- * will be undefined if the opendir method didn't set any value. +- * +- * Returning a directory entry from readdir() does not affect +- * its lookup count. +- * +- * If off_t is non-zero, then it will correspond to one of the off_t +- * values that was previously returned by readdir() for the same +- * directory handle. 
In this case, readdir() should skip over entries +- * coming before the position defined by the off_t value. If entries +- * are added or removed while the directory handle is open, they filesystem +- * may still include the entries that have been removed, and may not +- * report the entries that have been created. However, addition or +- * removal of entries must never cause readdir() to skip over unrelated +- * entries or to report them more than once. This means +- * that off_t can not be a simple index that enumerates the entries +- * that have been returned but must contain sufficient information to +- * uniquely determine the next directory entry to return even when the +- * set of entries is changing. +- * +- * The function does not have to report the '.' and '..' +- * entries, but is allowed to do so. Note that, if readdir does +- * not return '.' or '..', they will not be implicitly returned, +- * and this behavior is observable by the caller. +- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_data +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param size maximum number of bytes to send +- * @param off offset to continue reading the directory stream +- * @param fi file information +- */ +- void (*readdir) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, +- struct fuse_file_info *fi); +- +- /** +- * Release an open directory +- * +- * For every opendir call there will be exactly one releasedir +- * call (unless the filesystem is force-unmounted). +- * +- * fi->fh will contain the value set by the opendir method, or +- * will be undefined if the opendir method didn't set any value. 
+- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- */ +- void (*releasedir) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Synchronize directory contents +- * +- * If the datasync parameter is non-zero, then only the directory +- * contents should be flushed, not the meta data. +- * +- * fi->fh will contain the value set by the opendir method, or +- * will be undefined if the opendir method didn't set any value. +- * +- * If this request is answered with an error code of ENOSYS, +- * this is treated as success and future calls to fsyncdir() will +- * succeed automatically without being send to the filesystem +- * process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param datasync flag indicating if only data should be flushed +- * @param fi file information +- */ +- void (*fsyncdir) (fuse_req_t req, fuse_ino_t ino, int datasync, +- struct fuse_file_info *fi); +- +- /** +- * Get file system statistics +- * +- * Valid replies: +- * fuse_reply_statfs +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number, zero means "undefined" +- */ +- void (*statfs) (fuse_req_t req, fuse_ino_t ino); +- +- /** +- * Set an extended attribute +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future setxattr() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_err +- */ +- void (*setxattr) (fuse_req_t req, fuse_ino_t ino, const char *name, +- const char *value, size_t size, int flags); +- +- /** +- * Get an extended attribute +- * +- * If size is zero, the size of the value should be sent with +- * fuse_reply_xattr. 
+- * +- * If the size is non-zero, and the value fits in the buffer, the +- * value should be sent with fuse_reply_buf. +- * +- * If the size is too small for the value, the ERANGE error should +- * be sent. +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future getxattr() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_data +- * fuse_reply_xattr +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param name of the extended attribute +- * @param size maximum size of the value to send +- */ +- void (*getxattr) (fuse_req_t req, fuse_ino_t ino, const char *name, +- size_t size); +- +- /** +- * List extended attribute names +- * +- * If size is zero, the total size of the attribute list should be +- * sent with fuse_reply_xattr. +- * +- * If the size is non-zero, and the null character separated +- * attribute list fits in the buffer, the list should be sent with +- * fuse_reply_buf. +- * +- * If the size is too small for the list, the ERANGE error should +- * be sent. +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future listxattr() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_data +- * fuse_reply_xattr +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param size maximum size of the list to send +- */ +- void (*listxattr) (fuse_req_t req, fuse_ino_t ino, size_t size); +- +- /** +- * Remove an extended attribute +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. 
all +- * future removexattr() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param name of the extended attribute +- */ +- void (*removexattr) (fuse_req_t req, fuse_ino_t ino, const char *name); +- +- /** +- * Check file access permissions +- * +- * This will be called for the access() and chdir() system +- * calls. If the 'default_permissions' mount option is given, +- * this method is not called. +- * +- * This method is not called under Linux kernel versions 2.4.x +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent success, i.e. this and all future access() +- * requests will succeed without being send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param mask requested access mode +- */ +- void (*access) (fuse_req_t req, fuse_ino_t ino, int mask); +- +- /** +- * Create and open a file +- * +- * If the file does not exist, first create it with the specified +- * mode, and then open it. +- * +- * See the description of the open handler for more +- * information. +- * +- * If this method is not implemented or under Linux kernel +- * versions earlier than 2.6.15, the mknod() and open() methods +- * will be called instead. +- * +- * If this request is answered with an error code of ENOSYS, the handler +- * is treated as not implemented (i.e., for this and future requests the +- * mknod() and open() handlers will be called instead). 
+- * +- * Valid replies: +- * fuse_reply_create +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to create +- * @param mode file type and mode with which to create the new file +- * @param fi file information +- */ +- void (*create) (fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode, struct fuse_file_info *fi); +- +- /** +- * Test for a POSIX file lock +- * +- * Valid replies: +- * fuse_reply_lock +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * @param lock the region/type to test +- */ +- void (*getlk) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi, struct flock *lock); +- +- /** +- * Acquire, modify or release a POSIX file lock +- * +- * For POSIX threads (NPTL) there's a 1-1 relation between pid and +- * owner, but otherwise this is not always the case. For checking +- * lock ownership, 'fi->owner' must be used. The l_pid field in +- * 'struct flock' should only be used to fill in this field in +- * getlk(). +- * +- * Note: if the locking methods are not implemented, the kernel +- * will still allow file locking to work locally. Hence these are +- * only interesting for network filesystems and similar. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * @param lock the region/type to set +- * @param sleep locking operation may sleep +- */ +- void (*setlk) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi, +- struct flock *lock, int sleep); +- +- /** +- * Map block index within file to block index within device +- * +- * Note: This makes sense only for block device backed filesystems +- * mounted with the 'blkdev' option +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure, i.e. 
all future bmap() requests will +- * fail with the same error code without being send to the filesystem +- * process. +- * +- * Valid replies: +- * fuse_reply_bmap +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param blocksize unit of block index +- * @param idx block index within file +- */ +- void (*bmap) (fuse_req_t req, fuse_ino_t ino, size_t blocksize, +- uint64_t idx); +- +- /** +- * Ioctl +- * +- * Note: For unrestricted ioctls (not allowed for FUSE +- * servers), data in and out areas can be discovered by giving +- * iovs and setting FUSE_IOCTL_RETRY in *flags*. For +- * restricted ioctls, kernel prepares in/out data area +- * according to the information encoded in cmd. +- * +- * Valid replies: +- * fuse_reply_ioctl_retry +- * fuse_reply_ioctl +- * fuse_reply_ioctl_iov +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param cmd ioctl command +- * @param arg ioctl argument +- * @param fi file information +- * @param flags for FUSE_IOCTL_* flags +- * @param in_buf data fetched from the caller +- * @param in_bufsz number of fetched bytes +- * @param out_bufsz maximum size of output data +- * +- * Note : the unsigned long request submitted by the application +- * is truncated to 32 bits. +- */ +- void (*ioctl) (fuse_req_t req, fuse_ino_t ino, unsigned int cmd, +- void *arg, struct fuse_file_info *fi, unsigned flags, +- const void *in_buf, size_t in_bufsz, size_t out_bufsz); +- +- /** +- * Poll for IO readiness +- * +- * Note: If ph is non-NULL, the client should notify +- * when IO readiness events occur by calling +- * fuse_lowlevel_notify_poll() with the specified ph. +- * +- * Regardless of the number of times poll with a non-NULL ph +- * is received, single notification is enough to clear all. +- * Notifying more times incurs overhead but doesn't harm +- * correctness. 
+- * +- * The callee is responsible for destroying ph with +- * fuse_pollhandle_destroy() when no longer in use. +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as success (with a kernel-defined default poll-mask) and +- * future calls to pull() will succeed the same way without being send +- * to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_poll +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * @param ph poll handle to be used for notification +- */ +- void (*poll) (fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, +- struct fuse_pollhandle *ph); +- +- /** +- * Write data made available in a buffer +- * +- * This is a more generic version of the ->write() method. If +- * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the +- * kernel supports splicing from the fuse device, then the +- * data will be made available in pipe for supporting zero +- * copy data transfer. +- * +- * buf->count is guaranteed to be one (and thus buf->idx is +- * always zero). The write_buf handler must ensure that +- * bufv->off is correctly updated (reflecting the number of +- * bytes read from bufv->buf[0]). +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. 
+- * +- * Valid replies: +- * fuse_reply_write +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param bufv buffer containing the data +- * @param off offset to write to +- * @param fi file information +- */ +- void (*write_buf) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_bufvec *bufv, off_t off, +- struct fuse_file_info *fi); +- +- /** +- * Callback function for the retrieve request +- * +- * Valid replies: +- * fuse_reply_none +- * +- * @param req request handle +- * @param cookie user data supplied to fuse_lowlevel_notify_retrieve() +- * @param ino the inode number supplied to fuse_lowlevel_notify_retrieve() +- * @param offset the offset supplied to fuse_lowlevel_notify_retrieve() +- * @param bufv the buffer containing the returned data +- */ +- void (*retrieve_reply) (fuse_req_t req, void *cookie, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv); +- +- /** +- * Forget about multiple inodes +- * +- * See description of the forget function for more +- * information. +- * +- * Valid replies: +- * fuse_reply_none +- * +- * @param req request handle +- */ +- void (*forget_multi) (fuse_req_t req, size_t count, +- struct fuse_forget_data *forgets); +- +- /** +- * Acquire, modify or release a BSD file lock +- * +- * Note: if the locking methods are not implemented, the kernel +- * will still allow file locking to work locally. Hence these are +- * only interesting for network filesystems and similar. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * @param op the locking operation, see flock(2) +- */ +- void (*flock) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi, int op); +- +- /** +- * Allocate requested space. If this function returns success then +- * subsequent writes to the specified range shall not fail due to the lack +- * of free space on the file system storage media. 
+- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future fallocate() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param offset starting point for allocated region +- * @param length size of allocated region +- * @param mode determines the operation to be performed on the given range, +- * see fallocate(2) +- */ +- void (*fallocate) (fuse_req_t req, fuse_ino_t ino, int mode, +- off_t offset, off_t length, struct fuse_file_info *fi); +- +- /** +- * Read directory with attributes +- * +- * Send a buffer filled using fuse_add_direntry_plus(), with size not +- * exceeding the requested size. Send an empty buffer on end of +- * stream. +- * +- * fi->fh will contain the value set by the opendir method, or +- * will be undefined if the opendir method didn't set any value. +- * +- * In contrast to readdir() (which does not affect the lookup counts), +- * the lookup count of every entry returned by readdirplus(), except "." +- * and "..", is incremented by one. +- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_data +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param size maximum number of bytes to send +- * @param off offset to continue reading the directory stream +- * @param fi file information +- */ +- void (*readdirplus) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, +- struct fuse_file_info *fi); +- +- /** +- * Copy a range of data from one file to another +- * +- * Performs an optimized copy between two file descriptors without the +- * additional cost of transferring data through the FUSE kernel module +- * to user space (glibc) and then back into the FUSE filesystem again. 
+- * +- * In case this method is not implemented, glibc falls back to reading +- * data from the source and writing to the destination. Effectively +- * doing an inefficient copy of the data. +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future copy_file_range() requests will fail with EOPNOTSUPP without +- * being send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_write +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino_in the inode number or the source file +- * @param off_in starting point from were the data should be read +- * @param fi_in file information of the source file +- * @param ino_out the inode number or the destination file +- * @param off_out starting point where the data should be written +- * @param fi_out file information of the destination file +- * @param len maximum size of the data to copy +- * @param flags passed along with the copy_file_range() syscall +- */ +- void (*copy_file_range) (fuse_req_t req, fuse_ino_t ino_in, +- off_t off_in, struct fuse_file_info *fi_in, +- fuse_ino_t ino_out, off_t off_out, +- struct fuse_file_info *fi_out, size_t len, +- int flags); +- +- /** +- * Find next data or hole after the specified offset +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure, i.e. all future lseek() requests will +- * fail with the same error code without being send to the filesystem +- * process. 
+- * +- * Valid replies: +- * fuse_reply_lseek +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param off offset to start search from +- * @param whence either SEEK_DATA or SEEK_HOLE +- * @param fi file information +- */ +- void (*lseek) (fuse_req_t req, fuse_ino_t ino, off_t off, int whence, +- struct fuse_file_info *fi); ++ /** ++ * Initialize filesystem ++ * ++ * This function is called when libfuse establishes ++ * communication with the FUSE kernel module. The file system ++ * should use this module to inspect and/or modify the ++ * connection parameters provided in the `conn` structure. ++ * ++ * Note that some parameters may be overwritten by options ++ * passed to fuse_session_new() which take precedence over the ++ * values set in this handler. ++ * ++ * There's no reply to this function ++ * ++ * @param userdata the user data passed to fuse_session_new() ++ */ ++ void (*init)(void *userdata, struct fuse_conn_info *conn); ++ ++ /** ++ * Clean up filesystem. ++ * ++ * Called on filesystem exit. When this method is called, the ++ * connection to the kernel may be gone already, so that eg. calls ++ * to fuse_lowlevel_notify_* will fail. ++ * ++ * There's no reply to this function ++ * ++ * @param userdata the user data passed to fuse_session_new() ++ */ ++ void (*destroy)(void *userdata); ++ ++ /** ++ * Look up a directory entry by name and get its attributes. ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name the name to look up ++ */ ++ void (*lookup)(fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Forget about an inode ++ * ++ * This function is called when the kernel removes an inode ++ * from its internal caches. ++ * ++ * The inode's lookup count increases by one for every call to ++ * fuse_reply_entry and fuse_reply_create. 
The nlookup parameter ++ * indicates by how much the lookup count should be decreased. ++ * ++ * Inodes with a non-zero lookup count may receive request from ++ * the kernel even after calls to unlink, rmdir or (when ++ * overwriting an existing file) rename. Filesystems must handle ++ * such requests properly and it is recommended to defer removal ++ * of the inode until the lookup count reaches zero. Calls to ++ * unlink, rmdir or rename will be followed closely by forget ++ * unless the file or directory is open, in which case the ++ * kernel issues forget only after the release or releasedir ++ * calls. ++ * ++ * Note that if a file system will be exported over NFS the ++ * inodes lifetime must extend even beyond forget. See the ++ * generation field in struct fuse_entry_param above. ++ * ++ * On unmount the lookup count for all inodes implicitly drops ++ * to zero. It is not guaranteed that the file system will ++ * receive corresponding forget messages for the affected ++ * inodes. ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param nlookup the number of lookups to forget ++ */ ++ void (*forget)(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup); ++ ++ /** ++ * Get file attributes. ++ * ++ * If writeback caching is enabled, the kernel may have a ++ * better idea of a file's length than the FUSE file system ++ * (eg if there has been a write that extended the file size, ++ * but that has not yet been passed to the filesystem.n ++ * ++ * In this case, the st_size value provided by the file system ++ * will be ignored. 
++ * ++ * Valid replies: ++ * fuse_reply_attr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi for future use, currently always NULL ++ */ ++ void (*getattr)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Set file attributes ++ * ++ * In the 'attr' argument only members indicated by the 'to_set' ++ * bitmask contain valid values. Other members contain undefined ++ * values. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits if the file ++ * size or owner is being changed. ++ * ++ * If the setattr was invoked from the ftruncate() system call ++ * under Linux kernel versions 2.6.15 or later, the fi->fh will ++ * contain the value set by the open method or will be undefined ++ * if the open method didn't set any value. Otherwise (not ++ * ftruncate call, or kernel version earlier than 2.6.15) the fi ++ * parameter will be NULL. ++ * ++ * Valid replies: ++ * fuse_reply_attr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param attr the attributes ++ * @param to_set bit mask of attributes which should be set ++ * @param fi file information, or NULL ++ */ ++ void (*setattr)(fuse_req_t req, fuse_ino_t ino, struct stat *attr, ++ int to_set, struct fuse_file_info *fi); ++ ++ /** ++ * Read symbolic link ++ * ++ * Valid replies: ++ * fuse_reply_readlink ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ */ ++ void (*readlink)(fuse_req_t req, fuse_ino_t ino); ++ ++ /** ++ * Create file node ++ * ++ * Create a regular file, character device, block device, fifo or ++ * socket node. 
++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode file type and mode with which to create the new file ++ * @param rdev the device number (only valid if created file is a device) ++ */ ++ void (*mknod)(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, dev_t rdev); ++ ++ /** ++ * Create a directory ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode with which to create the new file ++ */ ++ void (*mkdir)(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode); ++ ++ /** ++ * Remove a file ++ * ++ * If the file's inode's lookup count is non-zero, the file ++ * system is expected to postpone any removal of the inode ++ * until the lookup count reaches zero (see description of the ++ * forget function). ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to remove ++ */ ++ void (*unlink)(fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Remove a directory ++ * ++ * If the directory's inode's lookup count is non-zero, the ++ * file system is expected to postpone any removal of the ++ * inode until the lookup count reaches zero (see description ++ * of the forget function). 
++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to remove ++ */ ++ void (*rmdir)(fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Create a symbolic link ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param link the contents of the symbolic link ++ * @param parent inode number of the parent directory ++ * @param name to create ++ */ ++ void (*symlink)(fuse_req_t req, const char *link, fuse_ino_t parent, ++ const char *name); ++ ++ /** ++ * Rename a file ++ * ++ * If the target exists it should be atomically replaced. If ++ * the target's inode's lookup count is non-zero, the file ++ * system is expected to postpone any removal of the inode ++ * until the lookup count reaches zero (see description of the ++ * forget function). ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EINVAL, i.e. all ++ * future bmap requests will fail with EINVAL without being ++ * send to the filesystem process. ++ * ++ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If ++ * RENAME_NOREPLACE is specified, the filesystem must not ++ * overwrite *newname* if it exists and return an error ++ * instead. If `RENAME_EXCHANGE` is specified, the filesystem ++ * must atomically exchange the two files, i.e. both must ++ * exist and neither may be deleted. 
++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the old parent directory ++ * @param name old name ++ * @param newparent inode number of the new parent directory ++ * @param newname new name ++ */ ++ void (*rename)(fuse_req_t req, fuse_ino_t parent, const char *name, ++ fuse_ino_t newparent, const char *newname, ++ unsigned int flags); ++ ++ /** ++ * Create a hard link ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the old inode number ++ * @param newparent inode number of the new parent directory ++ * @param newname new name to create ++ */ ++ void (*link)(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, ++ const char *newname); ++ ++ /** ++ * Open a file ++ * ++ * Open flags are available in fi->flags. The following rules ++ * apply. ++ * ++ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be ++ * filtered out / handled by the kernel. ++ * ++ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used ++ * by the filesystem to check if the operation is ++ * permitted. If the ``-o default_permissions`` mount ++ * option is given, this check is already done by the ++ * kernel before calling open() and may thus be omitted by ++ * the filesystem. ++ * ++ * - When writeback caching is enabled, the kernel may send ++ * read requests even for files opened with O_WRONLY. The ++ * filesystem should be prepared to handle this. ++ * ++ * - When writeback caching is disabled, the filesystem is ++ * expected to properly handle the O_APPEND flag and ensure ++ * that each write is appending to the end of the file. ++ * ++ * - When writeback caching is enabled, the kernel will ++ * handle O_APPEND. However, unless all changes to the file ++ * come through the kernel this will not work reliably. 
The ++ * filesystem should thus either ignore the O_APPEND flag ++ * (and let the kernel handle it), or return an error ++ * (indicating that reliably O_APPEND is not available). ++ * ++ * Filesystem may store an arbitrary file handle (pointer, ++ * index, etc) in fi->fh, and use this in other all other file ++ * operations (read, write, flush, release, fsync). ++ * ++ * Filesystem may also implement stateless file I/O and not store ++ * anything in fi->fh. ++ * ++ * There are also some flags (direct_io, keep_cache) which the ++ * filesystem may set in fi, to change the way the file is opened. ++ * See fuse_file_info structure in for more details. ++ * ++ * If this request is answered with an error code of ENOSYS ++ * and FUSE_CAP_NO_OPEN_SUPPORT is set in ++ * `fuse_conn_info.capable`, this is treated as success and ++ * future calls to open and release will also succeed without being ++ * sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_open ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*open)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Read data ++ * ++ * Read should send exactly the number of bytes requested except ++ * on EOF or error, otherwise the rest of the data will be ++ * substituted with zeroes. An exception to this is when the file ++ * has been opened in 'direct_io' mode, in which case the return ++ * value of the read system call will reflect the return value of ++ * this operation. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. 
++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_iov ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size number of bytes to read ++ * @param off offset to read from ++ * @param fi file information ++ */ ++ void (*read)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Write data ++ * ++ * Write should return exactly the number of bytes requested ++ * except on error. An exception to this is when the file has ++ * been opened in 'direct_io' mode, in which case the return value ++ * of the write system call will reflect the return value of this ++ * operation. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param buf data to write ++ * @param size number of bytes to write ++ * @param off offset to write to ++ * @param fi file information ++ */ ++ void (*write)(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, ++ off_t off, struct fuse_file_info *fi); ++ ++ /** ++ * Flush method ++ * ++ * This is called on each close() of the opened file. ++ * ++ * Since file descriptors can be duplicated (dup, dup2, fork), for ++ * one open call there may be many flush calls. ++ * ++ * Filesystems shouldn't assume that flush will always be called ++ * after some writes, or that if will be called at all. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * NOTE: the name of the method is misleading, since (unlike ++ * fsync) the filesystem is not forced to flush pending writes. 
++ * One reason to flush data is if the filesystem wants to return ++ * write errors during close. However, such use is non-portable ++ * because POSIX does not require [close] to wait for delayed I/O to ++ * complete. ++ * ++ * If the filesystem supports file locking operations (setlk, ++ * getlk) it should remove all locks belonging to 'fi->owner'. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to flush() will ++ * succeed automatically without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * ++ * [close]: ++ * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html ++ */ ++ void (*flush)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Release an open file ++ * ++ * Release is called when there are no more references to an open ++ * file: all file descriptors are closed and all memory mappings ++ * are unmapped. ++ * ++ * For every open call there will be exactly one release call (unless ++ * the filesystem is force-unmounted). ++ * ++ * The filesystem may reply with an error, but error values are ++ * not returned to close() or munmap() which triggered the ++ * release. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * fi->flags will contain the same flags as for open. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*release)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Synchronize file contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data. 
++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to fsync() will ++ * succeed automatically without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param datasync flag indicating if only data should be flushed ++ * @param fi file information ++ */ ++ void (*fsync)(fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Open a directory ++ * ++ * Filesystem may store an arbitrary file handle (pointer, index, ++ * etc) in fi->fh, and use this in other all other directory ++ * stream operations (readdir, releasedir, fsyncdir). ++ * ++ * If this request is answered with an error code of ENOSYS and ++ * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`, ++ * this is treated as success and future calls to opendir and ++ * releasedir will also succeed without being sent to the filesystem ++ * process. In addition, the kernel will cache readdir results ++ * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR. ++ * ++ * Valid replies: ++ * fuse_reply_open ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*opendir)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Read directory ++ * ++ * Send a buffer filled using fuse_add_direntry(), with size not ++ * exceeding the requested size. Send an empty buffer on end of ++ * stream. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * Returning a directory entry from readdir() does not affect ++ * its lookup count. ++ * ++ * If off_t is non-zero, then it will correspond to one of the off_t ++ * values that was previously returned by readdir() for the same ++ * directory handle. 
In this case, readdir() should skip over entries ++ * coming before the position defined by the off_t value. If entries ++ * are added or removed while the directory handle is open, the filesystem ++ * may still include the entries that have been removed, and may not ++ * report the entries that have been created. However, addition or ++ * removal of entries must never cause readdir() to skip over unrelated ++ * entries or to report them more than once. This means ++ * that off_t can not be a simple index that enumerates the entries ++ * that have been returned but must contain sufficient information to ++ * uniquely determine the next directory entry to return even when the ++ * set of entries is changing. ++ * ++ * The function does not have to report the '.' and '..' ++ * entries, but is allowed to do so. Note that, if readdir does ++ * not return '.' or '..', they will not be implicitly returned, ++ * and this behavior is observable by the caller. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum number of bytes to send ++ * @param off offset to continue reading the directory stream ++ * @param fi file information ++ */ ++ void (*readdir)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Release an open directory ++ * ++ * For every opendir call there will be exactly one releasedir ++ * call (unless the filesystem is force-unmounted). ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value.
++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*releasedir)(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Synchronize directory contents ++ * ++ * If the datasync parameter is non-zero, then only the directory ++ * contents should be flushed, not the meta data. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to fsyncdir() will ++ * succeed automatically without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param datasync flag indicating if only data should be flushed ++ * @param fi file information ++ */ ++ void (*fsyncdir)(fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Get file system statistics ++ * ++ * Valid replies: ++ * fuse_reply_statfs ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number, zero means "undefined" ++ */ ++ void (*statfs)(fuse_req_t req, fuse_ino_t ino); ++ ++ /** ++ * Set an extended attribute ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future setxattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ */ ++ void (*setxattr)(fuse_req_t req, fuse_ino_t ino, const char *name, ++ const char *value, size_t size, int flags); ++ ++ /** ++ * Get an extended attribute ++ * ++ * If size is zero, the size of the value should be sent with ++ * fuse_reply_xattr. 
++ * ++ * If the size is non-zero, and the value fits in the buffer, the ++ * value should be sent with fuse_reply_buf. ++ * ++ * If the size is too small for the value, the ERANGE error should ++ * be sent. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future getxattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_xattr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param name of the extended attribute ++ * @param size maximum size of the value to send ++ */ ++ void (*getxattr)(fuse_req_t req, fuse_ino_t ino, const char *name, ++ size_t size); ++ ++ /** ++ * List extended attribute names ++ * ++ * If size is zero, the total size of the attribute list should be ++ * sent with fuse_reply_xattr. ++ * ++ * If the size is non-zero, and the null character separated ++ * attribute list fits in the buffer, the list should be sent with ++ * fuse_reply_buf. ++ * ++ * If the size is too small for the list, the ERANGE error should ++ * be sent. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future listxattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_xattr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum size of the list to send ++ */ ++ void (*listxattr)(fuse_req_t req, fuse_ino_t ino, size_t size); ++ ++ /** ++ * Remove an extended attribute ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. 
all ++ * future removexattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param name of the extended attribute ++ */ ++ void (*removexattr)(fuse_req_t req, fuse_ino_t ino, const char *name); ++ ++ /** ++ * Check file access permissions ++ * ++ * This will be called for the access() and chdir() system ++ * calls. If the 'default_permissions' mount option is given, ++ * this method is not called. ++ * ++ * This method is not called under Linux kernel versions 2.4.x ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent success, i.e. this and all future access() ++ * requests will succeed without being send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param mask requested access mode ++ */ ++ void (*access)(fuse_req_t req, fuse_ino_t ino, int mask); ++ ++ /** ++ * Create and open a file ++ * ++ * If the file does not exist, first create it with the specified ++ * mode, and then open it. ++ * ++ * See the description of the open handler for more ++ * information. ++ * ++ * If this method is not implemented or under Linux kernel ++ * versions earlier than 2.6.15, the mknod() and open() methods ++ * will be called instead. ++ * ++ * If this request is answered with an error code of ENOSYS, the handler ++ * is treated as not implemented (i.e., for this and future requests the ++ * mknod() and open() handlers will be called instead). 
++ * ++ * Valid replies: ++ * fuse_reply_create ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode file type and mode with which to create the new file ++ * @param fi file information ++ */ ++ void (*create)(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, struct fuse_file_info *fi); ++ ++ /** ++ * Test for a POSIX file lock ++ * ++ * Valid replies: ++ * fuse_reply_lock ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param lock the region/type to test ++ */ ++ void (*getlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct flock *lock); ++ ++ /** ++ * Acquire, modify or release a POSIX file lock ++ * ++ * For POSIX threads (NPTL) there's a 1-1 relation between pid and ++ * owner, but otherwise this is not always the case. For checking ++ * lock ownership, 'fi->owner' must be used. The l_pid field in ++ * 'struct flock' should only be used to fill in this field in ++ * getlk(). ++ * ++ * Note: if the locking methods are not implemented, the kernel ++ * will still allow file locking to work locally. Hence these are ++ * only interesting for network filesystems and similar. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param lock the region/type to set ++ * @param sleep locking operation may sleep ++ */ ++ void (*setlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct flock *lock, int sleep); ++ ++ /** ++ * Map block index within file to block index within device ++ * ++ * Note: This makes sense only for block device backed filesystems ++ * mounted with the 'blkdev' option ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure, i.e. 
all future bmap() requests will ++ * fail with the same error code without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_bmap ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param blocksize unit of block index ++ * @param idx block index within file ++ */ ++ void (*bmap)(fuse_req_t req, fuse_ino_t ino, size_t blocksize, ++ uint64_t idx); ++ ++ /** ++ * Ioctl ++ * ++ * Note: For unrestricted ioctls (not allowed for FUSE ++ * servers), data in and out areas can be discovered by giving ++ * iovs and setting FUSE_IOCTL_RETRY in *flags*. For ++ * restricted ioctls, kernel prepares in/out data area ++ * according to the information encoded in cmd. ++ * ++ * Valid replies: ++ * fuse_reply_ioctl_retry ++ * fuse_reply_ioctl ++ * fuse_reply_ioctl_iov ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param cmd ioctl command ++ * @param arg ioctl argument ++ * @param fi file information ++ * @param flags for FUSE_IOCTL_* flags ++ * @param in_buf data fetched from the caller ++ * @param in_bufsz number of fetched bytes ++ * @param out_bufsz maximum size of output data ++ * ++ * Note : the unsigned long request submitted by the application ++ * is truncated to 32 bits. ++ */ ++ void (*ioctl)(fuse_req_t req, fuse_ino_t ino, unsigned int cmd, void *arg, ++ struct fuse_file_info *fi, unsigned flags, const void *in_buf, ++ size_t in_bufsz, size_t out_bufsz); ++ ++ /** ++ * Poll for IO readiness ++ * ++ * Note: If ph is non-NULL, the client should notify ++ * when IO readiness events occur by calling ++ * fuse_lowlevel_notify_poll() with the specified ph. ++ * ++ * Regardless of the number of times poll with a non-NULL ph ++ * is received, single notification is enough to clear all. ++ * Notifying more times incurs overhead but doesn't harm ++ * correctness. 
++ * ++ * The callee is responsible for destroying ph with ++ * fuse_pollhandle_destroy() when no longer in use. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as success (with a kernel-defined default poll-mask) and ++ * future calls to poll() will succeed the same way without being sent ++ * to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_poll ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param ph poll handle to be used for notification ++ */ ++ void (*poll)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct fuse_pollhandle *ph); ++ ++ /** ++ * Write data made available in a buffer ++ * ++ * This is a more generic version of the ->write() method. If ++ * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the ++ * kernel supports splicing from the fuse device, then the ++ * data will be made available in pipe for supporting zero ++ * copy data transfer. ++ * ++ * buf->count is guaranteed to be one (and thus buf->idx is ++ * always zero). The write_buf handler must ensure that ++ * bufv->off is correctly updated (reflecting the number of ++ * bytes read from bufv->buf[0]). ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits.
++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param bufv buffer containing the data ++ * @param off offset to write to ++ * @param fi file information ++ */ ++ void (*write_buf)(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, ++ off_t off, struct fuse_file_info *fi); ++ ++ /** ++ * Callback function for the retrieve request ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ * @param cookie user data supplied to fuse_lowlevel_notify_retrieve() ++ * @param ino the inode number supplied to fuse_lowlevel_notify_retrieve() ++ * @param offset the offset supplied to fuse_lowlevel_notify_retrieve() ++ * @param bufv the buffer containing the returned data ++ */ ++ void (*retrieve_reply)(fuse_req_t req, void *cookie, fuse_ino_t ino, ++ off_t offset, struct fuse_bufvec *bufv); ++ ++ /** ++ * Forget about multiple inodes ++ * ++ * See description of the forget function for more ++ * information. ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ */ ++ void (*forget_multi)(fuse_req_t req, size_t count, ++ struct fuse_forget_data *forgets); ++ ++ /** ++ * Acquire, modify or release a BSD file lock ++ * ++ * Note: if the locking methods are not implemented, the kernel ++ * will still allow file locking to work locally. Hence these are ++ * only interesting for network filesystems and similar. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param op the locking operation, see flock(2) ++ */ ++ void (*flock)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ int op); ++ ++ /** ++ * Allocate requested space. If this function returns success then ++ * subsequent writes to the specified range shall not fail due to the lack ++ * of free space on the file system storage media. 
++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future fallocate() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param offset starting point for allocated region ++ * @param length size of allocated region ++ * @param mode determines the operation to be performed on the given range, ++ * see fallocate(2) ++ */ ++ void (*fallocate)(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, ++ off_t length, struct fuse_file_info *fi); ++ ++ /** ++ * Read directory with attributes ++ * ++ * Send a buffer filled using fuse_add_direntry_plus(), with size not ++ * exceeding the requested size. Send an empty buffer on end of ++ * stream. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * In contrast to readdir() (which does not affect the lookup counts), ++ * the lookup count of every entry returned by readdirplus(), except "." ++ * and "..", is incremented by one. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum number of bytes to send ++ * @param off offset to continue reading the directory stream ++ * @param fi file information ++ */ ++ void (*readdirplus)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Copy a range of data from one file to another ++ * ++ * Performs an optimized copy between two file descriptors without the ++ * additional cost of transferring data through the FUSE kernel module ++ * to user space (glibc) and then back into the FUSE filesystem again. 
++ * ++ * In case this method is not implemented, glibc falls back to reading ++ * data from the source and writing to the destination. Effectively ++ * doing an inefficient copy of the data. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future copy_file_range() requests will fail with EOPNOTSUPP without ++ * being sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino_in the inode number of the source file ++ * @param off_in starting point from where the data should be read ++ * @param fi_in file information of the source file ++ * @param ino_out the inode number of the destination file ++ * @param off_out starting point where the data should be written ++ * @param fi_out file information of the destination file ++ * @param len maximum size of the data to copy ++ * @param flags passed along with the copy_file_range() syscall ++ */ ++ void (*copy_file_range)(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, ++ struct fuse_file_info *fi_in, fuse_ino_t ino_out, ++ off_t off_out, struct fuse_file_info *fi_out, ++ size_t len, int flags); ++ ++ /** ++ * Find next data or hole after the specified offset ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure, i.e. all future lseek() requests will ++ * fail with the same error code without being sent to the filesystem ++ * process.
++ * ++ * Valid replies: ++ * fuse_reply_lseek ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param off offset to start search from ++ * @param whence either SEEK_DATA or SEEK_HOLE ++ * @param fi file information ++ */ ++ void (*lseek)(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, ++ struct fuse_file_info *fi); + }; + + /** +@@ -1305,7 +1307,7 @@ int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e); + * @return zero for success, -errno for failure to send reply + */ + int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, +- const struct fuse_file_info *fi); ++ const struct fuse_file_info *fi); + + /** + * Reply with attributes +@@ -1315,11 +1317,11 @@ int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, + * + * @param req request handle + * @param attr the attributes +- * @param attr_timeout validity timeout (in seconds) for the attributes ++ * @param attr_timeout validity timeout (in seconds) for the attributes + * @return zero for success, -errno for failure to send reply + */ + int fuse_reply_attr(fuse_req_t req, const struct stat *attr, +- double attr_timeout); ++ double attr_timeout); + + /** + * Reply with the contents of a symbolic link +@@ -1417,7 +1419,7 @@ int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); + * @return zero for success, -errno for failure to send reply + */ + int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags); ++ enum fuse_buf_copy_flags flags); + + /** + * Reply with data vector +@@ -1480,9 +1482,9 @@ int fuse_reply_lock(fuse_req_t req, const struct flock *lock); + */ + int fuse_reply_bmap(fuse_req_t req, uint64_t idx); + +-/* ----------------------------------------------------------- * +- * Filling a buffer in readdir * +- * ----------------------------------------------------------- */ ++/* ++ * Filling a buffer in readdir ++ */ + + /** + * Add a directory entry to 
the buffer +@@ -1512,8 +1514,7 @@ int fuse_reply_bmap(fuse_req_t req, uint64_t idx); + * @return the space needed for the entry + */ + size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, +- const char *name, const struct stat *stbuf, +- off_t off); ++ const char *name, const struct stat *stbuf, off_t off); + + /** + * Add a directory entry to the buffer with the attributes +@@ -1529,8 +1530,8 @@ size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, + * @return the space needed for the entry + */ + size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, +- const char *name, +- const struct fuse_entry_param *e, off_t off); ++ const char *name, ++ const struct fuse_entry_param *e, off_t off); + + /** + * Reply to ask for data fetch and output buffer preparation. ioctl +@@ -1547,9 +1548,9 @@ size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, + * @param out_count number of entries in out_iov + * @return zero for success, -errno for failure to send reply + */ +-int fuse_reply_ioctl_retry(fuse_req_t req, +- const struct iovec *in_iov, size_t in_count, +- const struct iovec *out_iov, size_t out_count); ++int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, ++ size_t in_count, const struct iovec *out_iov, ++ size_t out_count); + + /** + * Reply to finish ioctl +@@ -1576,7 +1577,7 @@ int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size); + * @param count the size of vector + */ + int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, +- int count); ++ int count); + + /** + * Reply with poll result event mask +@@ -1598,9 +1599,9 @@ int fuse_reply_poll(fuse_req_t req, unsigned revents); + */ + int fuse_reply_lseek(fuse_req_t req, off_t off); + +-/* ----------------------------------------------------------- * +- * Notification * +- * ----------------------------------------------------------- */ ++/* ++ * Notification ++ */ + + /** + * Notify 
IO readiness event +@@ -1635,7 +1636,7 @@ int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph); + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, +- off_t off, off_t len); ++ off_t off, off_t len); + + /** + * Notify to invalidate parent attributes and the dentry matching +@@ -1663,7 +1664,7 @@ int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, +- const char *name, size_t namelen); ++ const char *name, size_t namelen); + + /** + * This function behaves like fuse_lowlevel_notify_inval_entry() with +@@ -1693,9 +1694,9 @@ int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, + * @param namelen strlen() of file name + * @return zero for success, -errno for failure + */ +-int fuse_lowlevel_notify_delete(struct fuse_session *se, +- fuse_ino_t parent, fuse_ino_t child, +- const char *name, size_t namelen); ++int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, ++ fuse_ino_t child, const char *name, ++ size_t namelen); + + /** + * Store data to the kernel buffers +@@ -1723,8 +1724,8 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags); ++ off_t offset, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags); + /** + * Retrieve data from the kernel buffers + * +@@ -1755,12 +1756,12 @@ int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +- size_t size, off_t offset, void *cookie); ++ size_t size, off_t offset, void 
*cookie); + + +-/* ----------------------------------------------------------- * +- * Utility functions * +- * ----------------------------------------------------------- */ ++/* ++ * Utility functions ++ */ + + /** + * Get the userdata from the request +@@ -1822,7 +1823,7 @@ typedef void (*fuse_interrupt_func_t)(fuse_req_t req, void *data); + * @param data user data passed to the callback function + */ + void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, +- void *data); ++ void *data); + + /** + * Check if a request has already been interrupted +@@ -1833,9 +1834,9 @@ void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, + int fuse_req_interrupted(fuse_req_t req); + + +-/* ----------------------------------------------------------- * +- * Inquiry functions * +- * ----------------------------------------------------------- */ ++/* ++ * Inquiry functions ++ */ + + /** + * Print low-level version information to stdout. +@@ -1854,18 +1855,18 @@ void fuse_lowlevel_help(void); + */ + void fuse_cmdline_help(void); + +-/* ----------------------------------------------------------- * +- * Filesystem setup & teardown * +- * ----------------------------------------------------------- */ ++/* ++ * Filesystem setup & teardown ++ */ + + struct fuse_cmdline_opts { +- int foreground; +- int debug; +- int nodefault_subtype; +- char *mountpoint; +- int show_version; +- int show_help; +- unsigned int max_idle_threads; ++ int foreground; ++ int debug; ++ int nodefault_subtype; ++ char *mountpoint; ++ int show_version; ++ int show_help; ++ unsigned int max_idle_threads; + }; + + /** +@@ -1886,8 +1887,7 @@ struct fuse_cmdline_opts { + * @param opts output argument for parsed options + * @return 0 on success, -1 on failure + */ +-int fuse_parse_cmdline(struct fuse_args *args, +- struct fuse_cmdline_opts *opts); ++int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts); + + /** + * Create a low level session. 
+@@ -1918,8 +1918,8 @@ int fuse_parse_cmdline(struct fuse_args *args, + * @return the fuse session on success, NULL on failure + **/ + struct fuse_session *fuse_session_new(struct fuse_args *args, +- const struct fuse_lowlevel_ops *op, +- size_t op_size, void *userdata); ++ const struct fuse_lowlevel_ops *op, ++ size_t op_size, void *userdata); + + /** + * Mount a FUSE file system. +@@ -2014,9 +2014,9 @@ void fuse_session_unmount(struct fuse_session *se); + */ + void fuse_session_destroy(struct fuse_session *se); + +-/* ----------------------------------------------------------- * +- * Custom event loop support * +- * ----------------------------------------------------------- */ ++/* ++ * Custom event loop support ++ */ + + /** + * Return file descriptor for communication with kernel. +@@ -2043,7 +2043,7 @@ int fuse_session_fd(struct fuse_session *se); + * @param buf the fuse_buf containing the request + */ + void fuse_session_process_buf(struct fuse_session *se, +- const struct fuse_buf *buf); ++ const struct fuse_buf *buf); + + /** + * Read a raw request from the kernel into the supplied buffer. +diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h +index 2f6663e..f252baa 100644 +--- a/tools/virtiofsd/fuse_misc.h ++++ b/tools/virtiofsd/fuse_misc.h +@@ -1,18 +1,18 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. 
++ * See the file COPYING.LIB ++ */ + + #include + + /* +- Versioned symbols cannot be used in some cases because it +- - confuse the dynamic linker in uClibc +- - not supported on MacOSX (in MachO binary format) +-*/ ++ * Versioned symbols cannot be used in some cases because it ++ * - confuse the dynamic linker in uClibc ++ * - not supported on MacOSX (in MachO binary format) ++ */ + #if (!defined(__UCLIBC__) && !defined(__APPLE__)) + #define FUSE_SYMVER(x) __asm__(x) + #else +@@ -25,11 +25,11 @@ + /* Is this hack still needed? */ + static inline void fuse_mutex_init(pthread_mutex_t *mut) + { +- pthread_mutexattr_t attr; +- pthread_mutexattr_init(&attr); +- pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); +- pthread_mutex_init(mut, &attr); +- pthread_mutexattr_destroy(&attr); ++ pthread_mutexattr_t attr; ++ pthread_mutexattr_init(&attr); ++ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); ++ pthread_mutex_init(mut, &attr); ++ pthread_mutexattr_destroy(&attr); + } + #endif + +diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c +index 93066b9..edd36f4 100644 +--- a/tools/virtiofsd/fuse_opt.c ++++ b/tools/virtiofsd/fuse_opt.c +@@ -1,423 +1,450 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- Implementation of option parsing routines (dealing with `struct +- fuse_args`). +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * Implementation of option parsing routines (dealing with `struct ++ * fuse_args`). ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. 
++ * See the file COPYING.LIB ++ */ + ++#include "fuse_opt.h" + #include "config.h" + #include "fuse_i.h" +-#include "fuse_opt.h" + #include "fuse_misc.h" + ++#include + #include + #include + #include +-#include + + struct fuse_opt_context { +- void *data; +- const struct fuse_opt *opt; +- fuse_opt_proc_t proc; +- int argctr; +- int argc; +- char **argv; +- struct fuse_args outargs; +- char *opts; +- int nonopt; ++ void *data; ++ const struct fuse_opt *opt; ++ fuse_opt_proc_t proc; ++ int argctr; ++ int argc; ++ char **argv; ++ struct fuse_args outargs; ++ char *opts; ++ int nonopt; + }; + + void fuse_opt_free_args(struct fuse_args *args) + { +- if (args) { +- if (args->argv && args->allocated) { +- int i; +- for (i = 0; i < args->argc; i++) +- free(args->argv[i]); +- free(args->argv); +- } +- args->argc = 0; +- args->argv = NULL; +- args->allocated = 0; +- } ++ if (args) { ++ if (args->argv && args->allocated) { ++ int i; ++ for (i = 0; i < args->argc; i++) { ++ free(args->argv[i]); ++ } ++ free(args->argv); ++ } ++ args->argc = 0; ++ args->argv = NULL; ++ args->allocated = 0; ++ } + } + + static int alloc_failed(void) + { +- fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); +- return -1; ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; + } + + int fuse_opt_add_arg(struct fuse_args *args, const char *arg) + { +- char **newargv; +- char *newarg; +- +- assert(!args->argv || args->allocated); +- +- newarg = strdup(arg); +- if (!newarg) +- return alloc_failed(); +- +- newargv = realloc(args->argv, (args->argc + 2) * sizeof(char *)); +- if (!newargv) { +- free(newarg); +- return alloc_failed(); +- } +- +- args->argv = newargv; +- args->allocated = 1; +- args->argv[args->argc++] = newarg; +- args->argv[args->argc] = NULL; +- return 0; ++ char **newargv; ++ char *newarg; ++ ++ assert(!args->argv || args->allocated); ++ ++ newarg = strdup(arg); ++ if (!newarg) { ++ return alloc_failed(); ++ } ++ ++ newargv = realloc(args->argv, 
(args->argc + 2) * sizeof(char *)); ++ if (!newargv) { ++ free(newarg); ++ return alloc_failed(); ++ } ++ ++ args->argv = newargv; ++ args->allocated = 1; ++ args->argv[args->argc++] = newarg; ++ args->argv[args->argc] = NULL; ++ return 0; + } + + static int fuse_opt_insert_arg_common(struct fuse_args *args, int pos, +- const char *arg) ++ const char *arg) + { +- assert(pos <= args->argc); +- if (fuse_opt_add_arg(args, arg) == -1) +- return -1; +- +- if (pos != args->argc - 1) { +- char *newarg = args->argv[args->argc - 1]; +- memmove(&args->argv[pos + 1], &args->argv[pos], +- sizeof(char *) * (args->argc - pos - 1)); +- args->argv[pos] = newarg; +- } +- return 0; ++ assert(pos <= args->argc); ++ if (fuse_opt_add_arg(args, arg) == -1) { ++ return -1; ++ } ++ ++ if (pos != args->argc - 1) { ++ char *newarg = args->argv[args->argc - 1]; ++ memmove(&args->argv[pos + 1], &args->argv[pos], ++ sizeof(char *) * (args->argc - pos - 1)); ++ args->argv[pos] = newarg; ++ } ++ return 0; + } + + int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg) + { +- return fuse_opt_insert_arg_common(args, pos, arg); ++ return fuse_opt_insert_arg_common(args, pos, arg); + } + + static int next_arg(struct fuse_opt_context *ctx, const char *opt) + { +- if (ctx->argctr + 1 >= ctx->argc) { +- fuse_log(FUSE_LOG_ERR, "fuse: missing argument after `%s'\n", opt); +- return -1; +- } +- ctx->argctr++; +- return 0; ++ if (ctx->argctr + 1 >= ctx->argc) { ++ fuse_log(FUSE_LOG_ERR, "fuse: missing argument after `%s'\n", opt); ++ return -1; ++ } ++ ctx->argctr++; ++ return 0; + } + + static int add_arg(struct fuse_opt_context *ctx, const char *arg) + { +- return fuse_opt_add_arg(&ctx->outargs, arg); ++ return fuse_opt_add_arg(&ctx->outargs, arg); + } + + static int add_opt_common(char **opts, const char *opt, int esc) + { +- unsigned oldlen = *opts ? 
strlen(*opts) : 0; +- char *d = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1); +- +- if (!d) +- return alloc_failed(); +- +- *opts = d; +- if (oldlen) { +- d += oldlen; +- *d++ = ','; +- } +- +- for (; *opt; opt++) { +- if (esc && (*opt == ',' || *opt == '\\')) +- *d++ = '\\'; +- *d++ = *opt; +- } +- *d = '\0'; +- +- return 0; ++ unsigned oldlen = *opts ? strlen(*opts) : 0; ++ char *d = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1); ++ ++ if (!d) { ++ return alloc_failed(); ++ } ++ ++ *opts = d; ++ if (oldlen) { ++ d += oldlen; ++ *d++ = ','; ++ } ++ ++ for (; *opt; opt++) { ++ if (esc && (*opt == ',' || *opt == '\\')) { ++ *d++ = '\\'; ++ } ++ *d++ = *opt; ++ } ++ *d = '\0'; ++ ++ return 0; + } + + int fuse_opt_add_opt(char **opts, const char *opt) + { +- return add_opt_common(opts, opt, 0); ++ return add_opt_common(opts, opt, 0); + } + + int fuse_opt_add_opt_escaped(char **opts, const char *opt) + { +- return add_opt_common(opts, opt, 1); ++ return add_opt_common(opts, opt, 1); + } + + static int add_opt(struct fuse_opt_context *ctx, const char *opt) + { +- return add_opt_common(&ctx->opts, opt, 1); ++ return add_opt_common(&ctx->opts, opt, 1); + } + + static int call_proc(struct fuse_opt_context *ctx, const char *arg, int key, +- int iso) ++ int iso) + { +- if (key == FUSE_OPT_KEY_DISCARD) +- return 0; +- +- if (key != FUSE_OPT_KEY_KEEP && ctx->proc) { +- int res = ctx->proc(ctx->data, arg, key, &ctx->outargs); +- if (res == -1 || !res) +- return res; +- } +- if (iso) +- return add_opt(ctx, arg); +- else +- return add_arg(ctx, arg); ++ if (key == FUSE_OPT_KEY_DISCARD) { ++ return 0; ++ } ++ ++ if (key != FUSE_OPT_KEY_KEEP && ctx->proc) { ++ int res = ctx->proc(ctx->data, arg, key, &ctx->outargs); ++ if (res == -1 || !res) { ++ return res; ++ } ++ } ++ if (iso) { ++ return add_opt(ctx, arg); ++ } else { ++ return add_arg(ctx, arg); ++ } + } + + static int match_template(const char *t, const char *arg, unsigned *sepp) + { +- int arglen = strlen(arg); +- 
const char *sep = strchr(t, '='); +- sep = sep ? sep : strchr(t, ' '); +- if (sep && (!sep[1] || sep[1] == '%')) { +- int tlen = sep - t; +- if (sep[0] == '=') +- tlen ++; +- if (arglen >= tlen && strncmp(arg, t, tlen) == 0) { +- *sepp = sep - t; +- return 1; +- } +- } +- if (strcmp(t, arg) == 0) { +- *sepp = 0; +- return 1; +- } +- return 0; ++ int arglen = strlen(arg); ++ const char *sep = strchr(t, '='); ++ sep = sep ? sep : strchr(t, ' '); ++ if (sep && (!sep[1] || sep[1] == '%')) { ++ int tlen = sep - t; ++ if (sep[0] == '=') { ++ tlen++; ++ } ++ if (arglen >= tlen && strncmp(arg, t, tlen) == 0) { ++ *sepp = sep - t; ++ return 1; ++ } ++ } ++ if (strcmp(t, arg) == 0) { ++ *sepp = 0; ++ return 1; ++ } ++ return 0; + } + + static const struct fuse_opt *find_opt(const struct fuse_opt *opt, +- const char *arg, unsigned *sepp) ++ const char *arg, unsigned *sepp) + { +- for (; opt && opt->templ; opt++) +- if (match_template(opt->templ, arg, sepp)) +- return opt; +- return NULL; ++ for (; opt && opt->templ; opt++) { ++ if (match_template(opt->templ, arg, sepp)) { ++ return opt; ++ } ++ } ++ return NULL; + } + + int fuse_opt_match(const struct fuse_opt *opts, const char *opt) + { +- unsigned dummy; +- return find_opt(opts, opt, &dummy) ? 1 : 0; ++ unsigned dummy; ++ return find_opt(opts, opt, &dummy) ? 
1 : 0; + } + + static int process_opt_param(void *var, const char *format, const char *param, +- const char *arg) ++ const char *arg) + { +- assert(format[0] == '%'); +- if (format[1] == 's') { +- char **s = var; +- char *copy = strdup(param); +- if (!copy) +- return alloc_failed(); +- +- free(*s); +- *s = copy; +- } else { +- if (sscanf(param, format, var) != 1) { +- fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n", arg); +- return -1; +- } +- } +- return 0; ++ assert(format[0] == '%'); ++ if (format[1] == 's') { ++ char **s = var; ++ char *copy = strdup(param); ++ if (!copy) { ++ return alloc_failed(); ++ } ++ ++ free(*s); ++ *s = copy; ++ } else { ++ if (sscanf(param, format, var) != 1) { ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n", ++ arg); ++ return -1; ++ } ++ } ++ return 0; + } + +-static int process_opt(struct fuse_opt_context *ctx, +- const struct fuse_opt *opt, unsigned sep, +- const char *arg, int iso) ++static int process_opt(struct fuse_opt_context *ctx, const struct fuse_opt *opt, ++ unsigned sep, const char *arg, int iso) + { +- if (opt->offset == -1U) { +- if (call_proc(ctx, arg, opt->value, iso) == -1) +- return -1; +- } else { +- void *var = (char *)ctx->data + opt->offset; +- if (sep && opt->templ[sep + 1]) { +- const char *param = arg + sep; +- if (opt->templ[sep] == '=') +- param ++; +- if (process_opt_param(var, opt->templ + sep + 1, +- param, arg) == -1) +- return -1; +- } else +- *(int *)var = opt->value; +- } +- return 0; ++ if (opt->offset == -1U) { ++ if (call_proc(ctx, arg, opt->value, iso) == -1) { ++ return -1; ++ } ++ } else { ++ void *var = (char *)ctx->data + opt->offset; ++ if (sep && opt->templ[sep + 1]) { ++ const char *param = arg + sep; ++ if (opt->templ[sep] == '=') { ++ param++; ++ } ++ if (process_opt_param(var, opt->templ + sep + 1, param, arg) == ++ -1) { ++ return -1; ++ } ++ } else { ++ *(int *)var = opt->value; ++ } ++ } ++ return 0; + } + + static int process_opt_sep_arg(struct 
fuse_opt_context *ctx, +- const struct fuse_opt *opt, unsigned sep, +- const char *arg, int iso) ++ const struct fuse_opt *opt, unsigned sep, ++ const char *arg, int iso) + { +- int res; +- char *newarg; +- char *param; +- +- if (next_arg(ctx, arg) == -1) +- return -1; +- +- param = ctx->argv[ctx->argctr]; +- newarg = malloc(sep + strlen(param) + 1); +- if (!newarg) +- return alloc_failed(); +- +- memcpy(newarg, arg, sep); +- strcpy(newarg + sep, param); +- res = process_opt(ctx, opt, sep, newarg, iso); +- free(newarg); +- +- return res; ++ int res; ++ char *newarg; ++ char *param; ++ ++ if (next_arg(ctx, arg) == -1) { ++ return -1; ++ } ++ ++ param = ctx->argv[ctx->argctr]; ++ newarg = malloc(sep + strlen(param) + 1); ++ if (!newarg) { ++ return alloc_failed(); ++ } ++ ++ memcpy(newarg, arg, sep); ++ strcpy(newarg + sep, param); ++ res = process_opt(ctx, opt, sep, newarg, iso); ++ free(newarg); ++ ++ return res; + } + + static int process_gopt(struct fuse_opt_context *ctx, const char *arg, int iso) + { +- unsigned sep; +- const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep); +- if (opt) { +- for (; opt; opt = find_opt(opt + 1, arg, &sep)) { +- int res; +- if (sep && opt->templ[sep] == ' ' && !arg[sep]) +- res = process_opt_sep_arg(ctx, opt, sep, arg, +- iso); +- else +- res = process_opt(ctx, opt, sep, arg, iso); +- if (res == -1) +- return -1; +- } +- return 0; +- } else +- return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso); ++ unsigned sep; ++ const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep); ++ if (opt) { ++ for (; opt; opt = find_opt(opt + 1, arg, &sep)) { ++ int res; ++ if (sep && opt->templ[sep] == ' ' && !arg[sep]) { ++ res = process_opt_sep_arg(ctx, opt, sep, arg, iso); ++ } else { ++ res = process_opt(ctx, opt, sep, arg, iso); ++ } ++ if (res == -1) { ++ return -1; ++ } ++ } ++ return 0; ++ } else { ++ return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso); ++ } + } + + static int process_real_option_group(struct fuse_opt_context *ctx, char 
*opts) + { +- char *s = opts; +- char *d = s; +- int end = 0; +- +- while (!end) { +- if (*s == '\0') +- end = 1; +- if (*s == ',' || end) { +- int res; +- +- *d = '\0'; +- res = process_gopt(ctx, opts, 1); +- if (res == -1) +- return -1; +- d = opts; +- } else { +- if (s[0] == '\\' && s[1] != '\0') { +- s++; +- if (s[0] >= '0' && s[0] <= '3' && +- s[1] >= '0' && s[1] <= '7' && +- s[2] >= '0' && s[2] <= '7') { +- *d++ = (s[0] - '0') * 0100 + +- (s[1] - '0') * 0010 + +- (s[2] - '0'); +- s += 2; +- } else { +- *d++ = *s; +- } +- } else { +- *d++ = *s; +- } +- } +- s++; +- } +- +- return 0; ++ char *s = opts; ++ char *d = s; ++ int end = 0; ++ ++ while (!end) { ++ if (*s == '\0') { ++ end = 1; ++ } ++ if (*s == ',' || end) { ++ int res; ++ ++ *d = '\0'; ++ res = process_gopt(ctx, opts, 1); ++ if (res == -1) { ++ return -1; ++ } ++ d = opts; ++ } else { ++ if (s[0] == '\\' && s[1] != '\0') { ++ s++; ++ if (s[0] >= '0' && s[0] <= '3' && s[1] >= '0' && s[1] <= '7' && ++ s[2] >= '0' && s[2] <= '7') { ++ *d++ = (s[0] - '0') * 0100 + (s[1] - '0') * 0010 + ++ (s[2] - '0'); ++ s += 2; ++ } else { ++ *d++ = *s; ++ } ++ } else { ++ *d++ = *s; ++ } ++ } ++ s++; ++ } ++ ++ return 0; + } + + static int process_option_group(struct fuse_opt_context *ctx, const char *opts) + { +- int res; +- char *copy = strdup(opts); +- +- if (!copy) { +- fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); +- return -1; +- } +- res = process_real_option_group(ctx, copy); +- free(copy); +- return res; ++ int res; ++ char *copy = strdup(opts); ++ ++ if (!copy) { ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; ++ } ++ res = process_real_option_group(ctx, copy); ++ free(copy); ++ return res; + } + + static int process_one(struct fuse_opt_context *ctx, const char *arg) + { +- if (ctx->nonopt || arg[0] != '-') +- return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0); +- else if (arg[1] == 'o') { +- if (arg[2]) +- return process_option_group(ctx, arg + 2); +- else { +- if 
(next_arg(ctx, arg) == -1) +- return -1; +- +- return process_option_group(ctx, +- ctx->argv[ctx->argctr]); +- } +- } else if (arg[1] == '-' && !arg[2]) { +- if (add_arg(ctx, arg) == -1) +- return -1; +- ctx->nonopt = ctx->outargs.argc; +- return 0; +- } else +- return process_gopt(ctx, arg, 0); ++ if (ctx->nonopt || arg[0] != '-') { ++ return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0); ++ } else if (arg[1] == 'o') { ++ if (arg[2]) { ++ return process_option_group(ctx, arg + 2); ++ } else { ++ if (next_arg(ctx, arg) == -1) { ++ return -1; ++ } ++ ++ return process_option_group(ctx, ctx->argv[ctx->argctr]); ++ } ++ } else if (arg[1] == '-' && !arg[2]) { ++ if (add_arg(ctx, arg) == -1) { ++ return -1; ++ } ++ ctx->nonopt = ctx->outargs.argc; ++ return 0; ++ } else { ++ return process_gopt(ctx, arg, 0); ++ } + } + + static int opt_parse(struct fuse_opt_context *ctx) + { +- if (ctx->argc) { +- if (add_arg(ctx, ctx->argv[0]) == -1) +- return -1; +- } +- +- for (ctx->argctr = 1; ctx->argctr < ctx->argc; ctx->argctr++) +- if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) +- return -1; +- +- if (ctx->opts) { +- if (fuse_opt_insert_arg(&ctx->outargs, 1, "-o") == -1 || +- fuse_opt_insert_arg(&ctx->outargs, 2, ctx->opts) == -1) +- return -1; +- } +- +- /* If option separator ("--") is the last argument, remove it */ +- if (ctx->nonopt && ctx->nonopt == ctx->outargs.argc && +- strcmp(ctx->outargs.argv[ctx->outargs.argc - 1], "--") == 0) { +- free(ctx->outargs.argv[ctx->outargs.argc - 1]); +- ctx->outargs.argv[--ctx->outargs.argc] = NULL; +- } +- +- return 0; ++ if (ctx->argc) { ++ if (add_arg(ctx, ctx->argv[0]) == -1) { ++ return -1; ++ } ++ } ++ ++ for (ctx->argctr = 1; ctx->argctr < ctx->argc; ctx->argctr++) { ++ if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) { ++ return -1; ++ } ++ } ++ ++ if (ctx->opts) { ++ if (fuse_opt_insert_arg(&ctx->outargs, 1, "-o") == -1 || ++ fuse_opt_insert_arg(&ctx->outargs, 2, ctx->opts) == -1) { ++ return -1; ++ } ++ } ++ ++ /* If 
option separator ("--") is the last argument, remove it */ ++ if (ctx->nonopt && ctx->nonopt == ctx->outargs.argc && ++ strcmp(ctx->outargs.argv[ctx->outargs.argc - 1], "--") == 0) { ++ free(ctx->outargs.argv[ctx->outargs.argc - 1]); ++ ctx->outargs.argv[--ctx->outargs.argc] = NULL; ++ } ++ ++ return 0; + } + + int fuse_opt_parse(struct fuse_args *args, void *data, +- const struct fuse_opt opts[], fuse_opt_proc_t proc) ++ const struct fuse_opt opts[], fuse_opt_proc_t proc) + { +- int res; +- struct fuse_opt_context ctx = { +- .data = data, +- .opt = opts, +- .proc = proc, +- }; +- +- if (!args || !args->argv || !args->argc) +- return 0; +- +- ctx.argc = args->argc; +- ctx.argv = args->argv; +- +- res = opt_parse(&ctx); +- if (res != -1) { +- struct fuse_args tmp = *args; +- *args = ctx.outargs; +- ctx.outargs = tmp; +- } +- free(ctx.opts); +- fuse_opt_free_args(&ctx.outargs); +- return res; ++ int res; ++ struct fuse_opt_context ctx = { ++ .data = data, ++ .opt = opts, ++ .proc = proc, ++ }; ++ ++ if (!args || !args->argv || !args->argc) { ++ return 0; ++ } ++ ++ ctx.argc = args->argc; ++ ctx.argv = args->argv; ++ ++ res = opt_parse(&ctx); ++ if (res != -1) { ++ struct fuse_args tmp = *args; ++ *args = ctx.outargs; ++ ctx.outargs = tmp; ++ } ++ free(ctx.opts); ++ fuse_opt_free_args(&ctx.outargs); ++ return res; + } +diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h +index 6910255..8f59b4d 100644 +--- a/tools/virtiofsd/fuse_opt.h ++++ b/tools/virtiofsd/fuse_opt.h +@@ -1,10 +1,10 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. 
++ */ + + #ifndef FUSE_OPT_H_ + #define FUSE_OPT_H_ +@@ -37,7 +37,7 @@ + * + * - 'offsetof(struct foo, member)' actions i) and iii) + * +- * - -1 action ii) ++ * - -1 action ii) + * + * The 'offsetof()' macro is defined in the header. + * +@@ -48,7 +48,7 @@ + * + * The types of templates are: + * +- * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only ++ * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only + * themselves. Invalid values are "--" and anything beginning + * with "-o" + * +@@ -71,58 +71,67 @@ + * freed. + */ + struct fuse_opt { +- /** Matching template and optional parameter formatting */ +- const char *templ; ++ /** Matching template and optional parameter formatting */ ++ const char *templ; + +- /** +- * Offset of variable within 'data' parameter of fuse_opt_parse() +- * or -1 +- */ +- unsigned long offset; ++ /** ++ * Offset of variable within 'data' parameter of fuse_opt_parse() ++ * or -1 ++ */ ++ unsigned long offset; + +- /** +- * Value to set the variable to, or to be passed as 'key' to the +- * processing function. Ignored if template has a format +- */ +- int value; ++ /** ++ * Value to set the variable to, or to be passed as 'key' to the ++ * processing function. Ignored if template has a format ++ */ ++ int value; + }; + + /** +- * Key option. In case of a match, the processing function will be ++ * Key option. In case of a match, the processing function will be + * called with the specified key. + */ +-#define FUSE_OPT_KEY(templ, key) { templ, -1U, key } ++#define FUSE_OPT_KEY(templ, key) \ ++ { \ ++ templ, -1U, key \ ++ } + + /** +- * Last option. An array of 'struct fuse_opt' must end with a NULL ++ * Last option. 
An array of 'struct fuse_opt' must end with a NULL + * template value + */ +-#define FUSE_OPT_END { NULL, 0, 0 } ++#define FUSE_OPT_END \ ++ { \ ++ NULL, 0, 0 \ ++ } + + /** + * Argument list + */ + struct fuse_args { +- /** Argument count */ +- int argc; ++ /** Argument count */ ++ int argc; + +- /** Argument vector. NULL terminated */ +- char **argv; ++ /** Argument vector. NULL terminated */ ++ char **argv; + +- /** Is 'argv' allocated? */ +- int allocated; ++ /** Is 'argv' allocated? */ ++ int allocated; + }; + + /** + * Initializer for 'struct fuse_args' + */ +-#define FUSE_ARGS_INIT(argc, argv) { argc, argv, 0 } ++#define FUSE_ARGS_INIT(argc, argv) \ ++ { \ ++ argc, argv, 0 \ ++ } + + /** + * Key value passed to the processing function if an option did not + * match any template + */ +-#define FUSE_OPT_KEY_OPT -1 ++#define FUSE_OPT_KEY_OPT -1 + + /** + * Key value passed to the processing function for all non-options +@@ -130,7 +139,7 @@ struct fuse_args { + * Non-options are the arguments beginning with a character other than + * '-' or all arguments after the special '--' option + */ +-#define FUSE_OPT_KEY_NONOPT -2 ++#define FUSE_OPT_KEY_NONOPT -2 + + /** + * Special key value for options to keep +@@ -174,7 +183,7 @@ struct fuse_args { + * @return -1 on error, 0 if arg is to be discarded, 1 if arg should be kept + */ + typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key, +- struct fuse_args *outargs); ++ struct fuse_args *outargs); + + /** + * Option parsing function +@@ -197,7 +206,7 @@ typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key, + * @return -1 on error, 0 on success + */ + int fuse_opt_parse(struct fuse_args *args, void *data, +- const struct fuse_opt opts[], fuse_opt_proc_t proc); ++ const struct fuse_opt opts[], fuse_opt_proc_t proc); + + /** + * Add an option to a comma separated option list +diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c +index 4271947..19d6791 100644 +--- 
a/tools/virtiofsd/fuse_signals.c ++++ b/tools/virtiofsd/fuse_signals.c +@@ -1,91 +1,95 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- Utility functions for setting signal handlers. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * Utility functions for setting signal handlers. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ + + #include "config.h" +-#include "fuse_lowlevel.h" + #include "fuse_i.h" ++#include "fuse_lowlevel.h" + +-#include +-#include + #include ++#include + #include ++#include + + static struct fuse_session *fuse_instance; + + static void exit_handler(int sig) + { +- if (fuse_instance) { +- fuse_session_exit(fuse_instance); +- if(sig <= 0) { +- fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n"); +- abort(); +- } +- fuse_instance->error = sig; +- } ++ if (fuse_instance) { ++ fuse_session_exit(fuse_instance); ++ if (sig <= 0) { ++ fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n"); ++ abort(); ++ } ++ fuse_instance->error = sig; ++ } + } + + static void do_nothing(int sig) + { +- (void) sig; ++ (void)sig; + } + + static int set_one_signal_handler(int sig, void (*handler)(int), int remove) + { +- struct sigaction sa; +- struct sigaction old_sa; ++ struct sigaction sa; ++ struct sigaction old_sa; + +- memset(&sa, 0, sizeof(struct sigaction)); +- sa.sa_handler = remove ? SIG_DFL : handler; +- sigemptyset(&(sa.sa_mask)); +- sa.sa_flags = 0; ++ memset(&sa, 0, sizeof(struct sigaction)); ++ sa.sa_handler = remove ? 
SIG_DFL : handler; ++ sigemptyset(&(sa.sa_mask)); ++ sa.sa_flags = 0; + +- if (sigaction(sig, NULL, &old_sa) == -1) { +- perror("fuse: cannot get old signal handler"); +- return -1; +- } ++ if (sigaction(sig, NULL, &old_sa) == -1) { ++ perror("fuse: cannot get old signal handler"); ++ return -1; ++ } + +- if (old_sa.sa_handler == (remove ? handler : SIG_DFL) && +- sigaction(sig, &sa, NULL) == -1) { +- perror("fuse: cannot set signal handler"); +- return -1; +- } +- return 0; ++ if (old_sa.sa_handler == (remove ? handler : SIG_DFL) && ++ sigaction(sig, &sa, NULL) == -1) { ++ perror("fuse: cannot set signal handler"); ++ return -1; ++ } ++ return 0; + } + + int fuse_set_signal_handlers(struct fuse_session *se) + { +- /* If we used SIG_IGN instead of the do_nothing function, +- then we would be unable to tell if we set SIG_IGN (and +- thus should reset to SIG_DFL in fuse_remove_signal_handlers) +- or if it was already set to SIG_IGN (and should be left +- untouched. */ +- if (set_one_signal_handler(SIGHUP, exit_handler, 0) == -1 || +- set_one_signal_handler(SIGINT, exit_handler, 0) == -1 || +- set_one_signal_handler(SIGTERM, exit_handler, 0) == -1 || +- set_one_signal_handler(SIGPIPE, do_nothing, 0) == -1) +- return -1; ++ /* ++ * If we used SIG_IGN instead of the do_nothing function, ++ * then we would be unable to tell if we set SIG_IGN (and ++ * thus should reset to SIG_DFL in fuse_remove_signal_handlers) ++ * or if it was already set to SIG_IGN (and should be left ++ * untouched. 
++ */ ++ if (set_one_signal_handler(SIGHUP, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGINT, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGTERM, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGPIPE, do_nothing, 0) == -1) { ++ return -1; ++ } + +- fuse_instance = se; +- return 0; ++ fuse_instance = se; ++ return 0; + } + + void fuse_remove_signal_handlers(struct fuse_session *se) + { +- if (fuse_instance != se) +- fuse_log(FUSE_LOG_ERR, +- "fuse: fuse_remove_signal_handlers: unknown session\n"); +- else +- fuse_instance = NULL; ++ if (fuse_instance != se) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: fuse_remove_signal_handlers: unknown session\n"); ++ } else { ++ fuse_instance = NULL; ++ } + +- set_one_signal_handler(SIGHUP, exit_handler, 1); +- set_one_signal_handler(SIGINT, exit_handler, 1); +- set_one_signal_handler(SIGTERM, exit_handler, 1); +- set_one_signal_handler(SIGPIPE, do_nothing, 1); ++ set_one_signal_handler(SIGHUP, exit_handler, 1); ++ set_one_signal_handler(SIGINT, exit_handler, 1); ++ set_one_signal_handler(SIGTERM, exit_handler, 1); ++ set_one_signal_handler(SIGPIPE, do_nothing, 1); + } +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5a2e64c..5711dd2 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -1,297 +1,309 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * Helper functions to create (simple) standalone programs. With the ++ * aid of these functions it should be possible to create full FUSE ++ * file system by implementing nothing but the request handlers. + +- Helper functions to create (simple) standalone programs. With the +- aid of these functions it should be possible to create full FUSE +- file system by implementing nothing but the request handlers. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. 
+-*/ ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. ++ */ + + #include "config.h" + #include "fuse_i.h" ++#include "fuse_lowlevel.h" + #include "fuse_misc.h" + #include "fuse_opt.h" +-#include "fuse_lowlevel.h" + #include "mount_util.h" + ++#include ++#include ++#include + #include + #include +-#include +-#include + #include +-#include +-#include + #include ++#include + +-#define FUSE_HELPER_OPT(t, p) \ +- { t, offsetof(struct fuse_cmdline_opts, p), 1 } ++#define FUSE_HELPER_OPT(t, p) \ ++ { \ ++ t, offsetof(struct fuse_cmdline_opts, p), 1 \ ++ } + + static const struct fuse_opt fuse_helper_opts[] = { +- FUSE_HELPER_OPT("-h", show_help), +- FUSE_HELPER_OPT("--help", show_help), +- FUSE_HELPER_OPT("-V", show_version), +- FUSE_HELPER_OPT("--version", show_version), +- FUSE_HELPER_OPT("-d", debug), +- FUSE_HELPER_OPT("debug", debug), +- FUSE_HELPER_OPT("-d", foreground), +- FUSE_HELPER_OPT("debug", foreground), +- FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), +- FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), +- FUSE_HELPER_OPT("-f", foreground), +- FUSE_HELPER_OPT("fsname=", nodefault_subtype), +- FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), +- FUSE_HELPER_OPT("subtype=", nodefault_subtype), +- FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), +- FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), +- FUSE_OPT_END ++ FUSE_HELPER_OPT("-h", show_help), ++ FUSE_HELPER_OPT("--help", show_help), ++ FUSE_HELPER_OPT("-V", show_version), ++ FUSE_HELPER_OPT("--version", show_version), ++ FUSE_HELPER_OPT("-d", debug), ++ FUSE_HELPER_OPT("debug", debug), ++ FUSE_HELPER_OPT("-d", foreground), ++ FUSE_HELPER_OPT("debug", foreground), ++ FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), ++ FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), ++ FUSE_HELPER_OPT("-f", foreground), ++ FUSE_HELPER_OPT("fsname=", nodefault_subtype), ++ FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), ++ FUSE_HELPER_OPT("subtype=", nodefault_subtype), ++ FUSE_OPT_KEY("subtype=", 
FUSE_OPT_KEY_KEEP), ++ FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), ++ FUSE_OPT_END + }; + + struct fuse_conn_info_opts { +- int atomic_o_trunc; +- int no_remote_posix_lock; +- int no_remote_flock; +- int splice_write; +- int splice_move; +- int splice_read; +- int no_splice_write; +- int no_splice_move; +- int no_splice_read; +- int auto_inval_data; +- int no_auto_inval_data; +- int no_readdirplus; +- int no_readdirplus_auto; +- int async_dio; +- int no_async_dio; +- int writeback_cache; +- int no_writeback_cache; +- int async_read; +- int sync_read; +- unsigned max_write; +- unsigned max_readahead; +- unsigned max_background; +- unsigned congestion_threshold; +- unsigned time_gran; +- int set_max_write; +- int set_max_readahead; +- int set_max_background; +- int set_congestion_threshold; +- int set_time_gran; ++ int atomic_o_trunc; ++ int no_remote_posix_lock; ++ int no_remote_flock; ++ int splice_write; ++ int splice_move; ++ int splice_read; ++ int no_splice_write; ++ int no_splice_move; ++ int no_splice_read; ++ int auto_inval_data; ++ int no_auto_inval_data; ++ int no_readdirplus; ++ int no_readdirplus_auto; ++ int async_dio; ++ int no_async_dio; ++ int writeback_cache; ++ int no_writeback_cache; ++ int async_read; ++ int sync_read; ++ unsigned max_write; ++ unsigned max_readahead; ++ unsigned max_background; ++ unsigned congestion_threshold; ++ unsigned time_gran; ++ int set_max_write; ++ int set_max_readahead; ++ int set_max_background; ++ int set_congestion_threshold; ++ int set_time_gran; + }; + +-#define CONN_OPTION(t, p, v) \ +- { t, offsetof(struct fuse_conn_info_opts, p), v } ++#define CONN_OPTION(t, p, v) \ ++ { \ ++ t, offsetof(struct fuse_conn_info_opts, p), v \ ++ } + static const struct fuse_opt conn_info_opt_spec[] = { +- CONN_OPTION("max_write=%u", max_write, 0), +- CONN_OPTION("max_write=", set_max_write, 1), +- CONN_OPTION("max_readahead=%u", max_readahead, 0), +- CONN_OPTION("max_readahead=", set_max_readahead, 1), +- 
CONN_OPTION("max_background=%u", max_background, 0), +- CONN_OPTION("max_background=", set_max_background, 1), +- CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0), +- CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1), +- CONN_OPTION("sync_read", sync_read, 1), +- CONN_OPTION("async_read", async_read, 1), +- CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1), +- CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1), +- CONN_OPTION("no_remote_lock", no_remote_flock, 1), +- CONN_OPTION("no_remote_flock", no_remote_flock, 1), +- CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1), +- CONN_OPTION("splice_write", splice_write, 1), +- CONN_OPTION("no_splice_write", no_splice_write, 1), +- CONN_OPTION("splice_move", splice_move, 1), +- CONN_OPTION("no_splice_move", no_splice_move, 1), +- CONN_OPTION("splice_read", splice_read, 1), +- CONN_OPTION("no_splice_read", no_splice_read, 1), +- CONN_OPTION("auto_inval_data", auto_inval_data, 1), +- CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1), +- CONN_OPTION("readdirplus=no", no_readdirplus, 1), +- CONN_OPTION("readdirplus=yes", no_readdirplus, 0), +- CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1), +- CONN_OPTION("readdirplus=auto", no_readdirplus, 0), +- CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0), +- CONN_OPTION("async_dio", async_dio, 1), +- CONN_OPTION("no_async_dio", no_async_dio, 1), +- CONN_OPTION("writeback_cache", writeback_cache, 1), +- CONN_OPTION("no_writeback_cache", no_writeback_cache, 1), +- CONN_OPTION("time_gran=%u", time_gran, 0), +- CONN_OPTION("time_gran=", set_time_gran, 1), +- FUSE_OPT_END ++ CONN_OPTION("max_write=%u", max_write, 0), ++ CONN_OPTION("max_write=", set_max_write, 1), ++ CONN_OPTION("max_readahead=%u", max_readahead, 0), ++ CONN_OPTION("max_readahead=", set_max_readahead, 1), ++ CONN_OPTION("max_background=%u", max_background, 0), ++ CONN_OPTION("max_background=", set_max_background, 1), ++ 
CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0), ++ CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1), ++ CONN_OPTION("sync_read", sync_read, 1), ++ CONN_OPTION("async_read", async_read, 1), ++ CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1), ++ CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1), ++ CONN_OPTION("no_remote_lock", no_remote_flock, 1), ++ CONN_OPTION("no_remote_flock", no_remote_flock, 1), ++ CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1), ++ CONN_OPTION("splice_write", splice_write, 1), ++ CONN_OPTION("no_splice_write", no_splice_write, 1), ++ CONN_OPTION("splice_move", splice_move, 1), ++ CONN_OPTION("no_splice_move", no_splice_move, 1), ++ CONN_OPTION("splice_read", splice_read, 1), ++ CONN_OPTION("no_splice_read", no_splice_read, 1), ++ CONN_OPTION("auto_inval_data", auto_inval_data, 1), ++ CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1), ++ CONN_OPTION("readdirplus=no", no_readdirplus, 1), ++ CONN_OPTION("readdirplus=yes", no_readdirplus, 0), ++ CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1), ++ CONN_OPTION("readdirplus=auto", no_readdirplus, 0), ++ CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0), ++ CONN_OPTION("async_dio", async_dio, 1), ++ CONN_OPTION("no_async_dio", no_async_dio, 1), ++ CONN_OPTION("writeback_cache", writeback_cache, 1), ++ CONN_OPTION("no_writeback_cache", no_writeback_cache, 1), ++ CONN_OPTION("time_gran=%u", time_gran, 0), ++ CONN_OPTION("time_gran=", set_time_gran, 1), ++ FUSE_OPT_END + }; + + + void fuse_cmdline_help(void) + { +- printf(" -h --help print help\n" +- " -V --version print version\n" +- " -d -o debug enable debug output (implies -f)\n" +- " -f foreground operation\n" +- " -o max_idle_threads the maximum number of idle worker threads\n" +- " allowed (default: 10)\n"); ++ printf( ++ " -h --help print help\n" ++ " -V --version print version\n" ++ " -d -o debug enable debug output (implies -f)\n" ++ " -f foreground operation\n" ++ " -o 
max_idle_threads the maximum number of idle worker threads\n" ++ " allowed (default: 10)\n"); + } + + static int fuse_helper_opt_proc(void *data, const char *arg, int key, +- struct fuse_args *outargs) ++ struct fuse_args *outargs) + { +- (void) outargs; +- struct fuse_cmdline_opts *opts = data; +- +- switch (key) { +- case FUSE_OPT_KEY_NONOPT: +- if (!opts->mountpoint) { +- if (fuse_mnt_parse_fuse_fd(arg) != -1) { +- return fuse_opt_add_opt(&opts->mountpoint, arg); +- } +- +- char mountpoint[PATH_MAX] = ""; +- if (realpath(arg, mountpoint) == NULL) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: bad mount point `%s': %s\n", +- arg, strerror(errno)); +- return -1; +- } +- return fuse_opt_add_opt(&opts->mountpoint, mountpoint); +- } else { +- fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); +- return -1; +- } +- +- default: +- /* Pass through unknown options */ +- return 1; +- } ++ (void)outargs; ++ struct fuse_cmdline_opts *opts = data; ++ ++ switch (key) { ++ case FUSE_OPT_KEY_NONOPT: ++ if (!opts->mountpoint) { ++ if (fuse_mnt_parse_fuse_fd(arg) != -1) { ++ return fuse_opt_add_opt(&opts->mountpoint, arg); ++ } ++ ++ char mountpoint[PATH_MAX] = ""; ++ if (realpath(arg, mountpoint) == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: bad mount point `%s': %s\n", arg, ++ strerror(errno)); ++ return -1; ++ } ++ return fuse_opt_add_opt(&opts->mountpoint, mountpoint); ++ } else { ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); ++ return -1; ++ } ++ ++ default: ++ /* Pass through unknown options */ ++ return 1; ++ } + } + +-int fuse_parse_cmdline(struct fuse_args *args, +- struct fuse_cmdline_opts *opts) ++int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) + { +- memset(opts, 0, sizeof(struct fuse_cmdline_opts)); ++ memset(opts, 0, sizeof(struct fuse_cmdline_opts)); + +- opts->max_idle_threads = 10; ++ opts->max_idle_threads = 10; + +- if (fuse_opt_parse(args, opts, fuse_helper_opts, +- fuse_helper_opt_proc) == -1) +- return -1; ++ 
if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == ++ -1) { ++ return -1; ++ } + +- return 0; ++ return 0; + } + + + int fuse_daemonize(int foreground) + { +- if (!foreground) { +- int nullfd; +- int waiter[2]; +- char completed; +- +- if (pipe(waiter)) { +- perror("fuse_daemonize: pipe"); +- return -1; +- } +- +- /* +- * demonize current process by forking it and killing the +- * parent. This makes current process as a child of 'init'. +- */ +- switch(fork()) { +- case -1: +- perror("fuse_daemonize: fork"); +- return -1; +- case 0: +- break; +- default: +- (void) read(waiter[0], &completed, sizeof(completed)); +- _exit(0); +- } +- +- if (setsid() == -1) { +- perror("fuse_daemonize: setsid"); +- return -1; +- } +- +- (void) chdir("/"); +- +- nullfd = open("/dev/null", O_RDWR, 0); +- if (nullfd != -1) { +- (void) dup2(nullfd, 0); +- (void) dup2(nullfd, 1); +- (void) dup2(nullfd, 2); +- if (nullfd > 2) +- close(nullfd); +- } +- +- /* Propagate completion of daemon initialization */ +- completed = 1; +- (void) write(waiter[1], &completed, sizeof(completed)); +- close(waiter[0]); +- close(waiter[1]); +- } else { +- (void) chdir("/"); +- } +- return 0; ++ if (!foreground) { ++ int nullfd; ++ int waiter[2]; ++ char completed; ++ ++ if (pipe(waiter)) { ++ perror("fuse_daemonize: pipe"); ++ return -1; ++ } ++ ++ /* ++ * demonize current process by forking it and killing the ++ * parent. This makes current process as a child of 'init'. 
++ */ ++ switch (fork()) { ++ case -1: ++ perror("fuse_daemonize: fork"); ++ return -1; ++ case 0: ++ break; ++ default: ++ (void)read(waiter[0], &completed, sizeof(completed)); ++ _exit(0); ++ } ++ ++ if (setsid() == -1) { ++ perror("fuse_daemonize: setsid"); ++ return -1; ++ } ++ ++ (void)chdir("/"); ++ ++ nullfd = open("/dev/null", O_RDWR, 0); ++ if (nullfd != -1) { ++ (void)dup2(nullfd, 0); ++ (void)dup2(nullfd, 1); ++ (void)dup2(nullfd, 2); ++ if (nullfd > 2) { ++ close(nullfd); ++ } ++ } ++ ++ /* Propagate completion of daemon initialization */ ++ completed = 1; ++ (void)write(waiter[1], &completed, sizeof(completed)); ++ close(waiter[0]); ++ close(waiter[1]); ++ } else { ++ (void)chdir("/"); ++ } ++ return 0; + } + + void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, +- struct fuse_conn_info *conn) ++ struct fuse_conn_info *conn) + { +- if(opts->set_max_write) +- conn->max_write = opts->max_write; +- if(opts->set_max_background) +- conn->max_background = opts->max_background; +- if(opts->set_congestion_threshold) +- conn->congestion_threshold = opts->congestion_threshold; +- if(opts->set_time_gran) +- conn->time_gran = opts->time_gran; +- if(opts->set_max_readahead) +- conn->max_readahead = opts->max_readahead; +- +-#define LL_ENABLE(cond,cap) \ +- if (cond) conn->want |= (cap) +-#define LL_DISABLE(cond,cap) \ +- if (cond) conn->want &= ~(cap) +- +- LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ); +- LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ); +- +- LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE); +- LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE); +- +- LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE); +- LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE); +- +- LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); +- LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); +- +- LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS); +- LL_DISABLE(opts->no_readdirplus_auto, 
FUSE_CAP_READDIRPLUS_AUTO); +- +- LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO); +- LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO); +- +- LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE); +- LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE); +- +- LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ); +- LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ); +- +- LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS); +- LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS); ++ if (opts->set_max_write) { ++ conn->max_write = opts->max_write; ++ } ++ if (opts->set_max_background) { ++ conn->max_background = opts->max_background; ++ } ++ if (opts->set_congestion_threshold) { ++ conn->congestion_threshold = opts->congestion_threshold; ++ } ++ if (opts->set_time_gran) { ++ conn->time_gran = opts->time_gran; ++ } ++ if (opts->set_max_readahead) { ++ conn->max_readahead = opts->max_readahead; ++ } ++ ++#define LL_ENABLE(cond, cap) \ ++ if (cond) \ ++ conn->want |= (cap) ++#define LL_DISABLE(cond, cap) \ ++ if (cond) \ ++ conn->want &= ~(cap) ++ ++ LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ); ++ LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ); ++ ++ LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE); ++ LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE); ++ ++ LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE); ++ LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE); ++ ++ LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); ++ LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); ++ ++ LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS); ++ LL_DISABLE(opts->no_readdirplus_auto, FUSE_CAP_READDIRPLUS_AUTO); ++ ++ LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO); ++ LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO); ++ ++ LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE); ++ LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE); ++ ++ LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ); ++ 
LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ); ++ ++ LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS); ++ LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS); + } + +-struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args) ++struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args) + { +- struct fuse_conn_info_opts *opts; +- +- opts = calloc(1, sizeof(struct fuse_conn_info_opts)); +- if(opts == NULL) { +- fuse_log(FUSE_LOG_ERR, "calloc failed\n"); +- return NULL; +- } +- if(fuse_opt_parse(args, opts, conn_info_opt_spec, NULL) == -1) { +- free(opts); +- return NULL; +- } +- return opts; ++ struct fuse_conn_info_opts *opts; ++ ++ opts = calloc(1, sizeof(struct fuse_conn_info_opts)); ++ if (opts == NULL) { ++ fuse_log(FUSE_LOG_ERR, "calloc failed\n"); ++ return NULL; ++ } ++ if (fuse_opt_parse(args, opts, conn_info_opt_spec, NULL) == -1) { ++ free(opts); ++ return NULL; ++ } ++ return opts; + } +diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h +index 7c5f561..0b98275 100644 +--- a/tools/virtiofsd/passthrough_helpers.h ++++ b/tools/virtiofsd/passthrough_helpers.h +@@ -28,23 +28,24 @@ + * operation + */ + static int mknod_wrapper(int dirfd, const char *path, const char *link, +- int mode, dev_t rdev) ++ int mode, dev_t rdev) + { +- int res; ++ int res; + +- if (S_ISREG(mode)) { +- res = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode); +- if (res >= 0) +- res = close(res); +- } else if (S_ISDIR(mode)) { +- res = mkdirat(dirfd, path, mode); +- } else if (S_ISLNK(mode) && link != NULL) { +- res = symlinkat(link, dirfd, path); +- } else if (S_ISFIFO(mode)) { +- res = mkfifoat(dirfd, path, mode); +- } else { +- res = mknodat(dirfd, path, mode, rdev); +- } ++ if (S_ISREG(mode)) { ++ res = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode); ++ if (res >= 0) { ++ res = close(res); ++ } ++ } else if (S_ISDIR(mode)) { ++ res = mkdirat(dirfd, path, mode); ++ } else if 
(S_ISLNK(mode) && link != NULL) { ++ res = symlinkat(link, dirfd, path); ++ } else if (S_ISFIFO(mode)) { ++ res = mkfifoat(dirfd, path, mode); ++ } else { ++ res = mknodat(dirfd, path, mode, rdev); ++ } + +- return res; ++ return res; + } +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e5f7115..c5850ef 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1,12 +1,12 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU GPLv2. +- See the file COPYING. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU GPLv2. ++ * See the file COPYING. ++ */ + +-/** @file ++/* + * + * This file system mirrors the existing file system hierarchy of the + * system, starting at the root file system. This is implemented by +@@ -28,7 +28,8 @@ + * + * Compile with: + * +- * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o passthrough_ll ++ * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o ++ * passthrough_ll + * + * ## Source code ## + * \include passthrough_ll.c +@@ -39,1299 +40,1365 @@ + + #include "config.h" + +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include + #include ++#include + #include ++#include + #include ++#include + #include ++#include ++#include ++#include ++#include ++#include + #include + #include ++#include + + #include "passthrough_helpers.h" + +-/* We are re-using pointers to our `struct lo_inode` and `struct +- lo_dirp` elements as inodes. This means that we must be able to +- store uintptr_t values in a fuse_ino_t variable. The following +- incantation checks this condition at compile time. 
*/ +-#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && !defined __cplusplus ++/* ++ * We are re-using pointers to our `struct lo_inode` and `struct ++ * lo_dirp` elements as inodes. This means that we must be able to ++ * store uintptr_t values in a fuse_ino_t variable. The following ++ * incantation checks this condition at compile time. ++ */ ++#if defined(__GNUC__) && \ ++ (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && \ ++ !defined __cplusplus + _Static_assert(sizeof(fuse_ino_t) >= sizeof(uintptr_t), +- "fuse_ino_t too small to hold uintptr_t values!"); ++ "fuse_ino_t too small to hold uintptr_t values!"); + #else +-struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct \ +- { unsigned _uintptr_to_must_hold_fuse_ino_t: +- ((sizeof(fuse_ino_t) >= sizeof(uintptr_t)) ? 1 : -1); }; ++struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct { ++ unsigned _uintptr_to_must_hold_fuse_ino_t ++ : ((sizeof(fuse_ino_t) >= sizeof(uintptr_t)) ? 1 : -1); ++}; + #endif + + struct lo_inode { +- struct lo_inode *next; /* protected by lo->mutex */ +- struct lo_inode *prev; /* protected by lo->mutex */ +- int fd; +- bool is_symlink; +- ino_t ino; +- dev_t dev; +- uint64_t refcount; /* protected by lo->mutex */ ++ struct lo_inode *next; /* protected by lo->mutex */ ++ struct lo_inode *prev; /* protected by lo->mutex */ ++ int fd; ++ bool is_symlink; ++ ino_t ino; ++ dev_t dev; ++ uint64_t refcount; /* protected by lo->mutex */ + }; + + enum { +- CACHE_NEVER, +- CACHE_NORMAL, +- CACHE_ALWAYS, ++ CACHE_NEVER, ++ CACHE_NORMAL, ++ CACHE_ALWAYS, + }; + + struct lo_data { +- pthread_mutex_t mutex; +- int debug; +- int writeback; +- int flock; +- int xattr; +- const char *source; +- double timeout; +- int cache; +- int timeout_set; +- struct lo_inode root; /* protected by lo->mutex */ ++ pthread_mutex_t mutex; ++ int debug; ++ int writeback; ++ int flock; ++ int xattr; ++ const char *source; ++ double timeout; ++ int cache; ++ int timeout_set; ++ 
struct lo_inode root; /* protected by lo->mutex */ + }; + + static const struct fuse_opt lo_opts[] = { +- { "writeback", +- offsetof(struct lo_data, writeback), 1 }, +- { "no_writeback", +- offsetof(struct lo_data, writeback), 0 }, +- { "source=%s", +- offsetof(struct lo_data, source), 0 }, +- { "flock", +- offsetof(struct lo_data, flock), 1 }, +- { "no_flock", +- offsetof(struct lo_data, flock), 0 }, +- { "xattr", +- offsetof(struct lo_data, xattr), 1 }, +- { "no_xattr", +- offsetof(struct lo_data, xattr), 0 }, +- { "timeout=%lf", +- offsetof(struct lo_data, timeout), 0 }, +- { "timeout=", +- offsetof(struct lo_data, timeout_set), 1 }, +- { "cache=never", +- offsetof(struct lo_data, cache), CACHE_NEVER }, +- { "cache=auto", +- offsetof(struct lo_data, cache), CACHE_NORMAL }, +- { "cache=always", +- offsetof(struct lo_data, cache), CACHE_ALWAYS }, +- +- FUSE_OPT_END ++ { "writeback", offsetof(struct lo_data, writeback), 1 }, ++ { "no_writeback", offsetof(struct lo_data, writeback), 0 }, ++ { "source=%s", offsetof(struct lo_data, source), 0 }, ++ { "flock", offsetof(struct lo_data, flock), 1 }, ++ { "no_flock", offsetof(struct lo_data, flock), 0 }, ++ { "xattr", offsetof(struct lo_data, xattr), 1 }, ++ { "no_xattr", offsetof(struct lo_data, xattr), 0 }, ++ { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, ++ { "timeout=", offsetof(struct lo_data, timeout_set), 1 }, ++ { "cache=never", offsetof(struct lo_data, cache), CACHE_NEVER }, ++ { "cache=auto", offsetof(struct lo_data, cache), CACHE_NORMAL }, ++ { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, ++ ++ FUSE_OPT_END + }; + + static struct lo_data *lo_data(fuse_req_t req) + { +- return (struct lo_data *) fuse_req_userdata(req); ++ return (struct lo_data *)fuse_req_userdata(req); + } + + static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + { +- if (ino == FUSE_ROOT_ID) +- return &lo_data(req)->root; +- else +- return (struct lo_inode *) (uintptr_t) ino; ++ if (ino == 
FUSE_ROOT_ID) { ++ return &lo_data(req)->root; ++ } else { ++ return (struct lo_inode *)(uintptr_t)ino; ++ } + } + + static int lo_fd(fuse_req_t req, fuse_ino_t ino) + { +- return lo_inode(req, ino)->fd; ++ return lo_inode(req, ino)->fd; + } + + static bool lo_debug(fuse_req_t req) + { +- return lo_data(req)->debug != 0; ++ return lo_data(req)->debug != 0; + } + +-static void lo_init(void *userdata, +- struct fuse_conn_info *conn) ++static void lo_init(void *userdata, struct fuse_conn_info *conn) + { +- struct lo_data *lo = (struct lo_data*) userdata; +- +- if(conn->capable & FUSE_CAP_EXPORT_SUPPORT) +- conn->want |= FUSE_CAP_EXPORT_SUPPORT; +- +- if (lo->writeback && +- conn->capable & FUSE_CAP_WRITEBACK_CACHE) { +- if (lo->debug) +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); +- conn->want |= FUSE_CAP_WRITEBACK_CACHE; +- } +- if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { +- if (lo->debug) +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); +- conn->want |= FUSE_CAP_FLOCK_LOCKS; +- } ++ struct lo_data *lo = (struct lo_data *)userdata; ++ ++ if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) { ++ conn->want |= FUSE_CAP_EXPORT_SUPPORT; ++ } ++ ++ if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) { ++ if (lo->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); ++ } ++ conn->want |= FUSE_CAP_WRITEBACK_CACHE; ++ } ++ if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { ++ if (lo->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); ++ } ++ conn->want |= FUSE_CAP_FLOCK_LOCKS; ++ } + } + + static void lo_getattr(fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi) ++ struct fuse_file_info *fi) + { +- int res; +- struct stat buf; +- struct lo_data *lo = lo_data(req); ++ int res; ++ struct stat buf; ++ struct lo_data *lo = lo_data(req); + +- (void) fi; ++ (void)fi; + +- res = fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1) +- 
return (void) fuse_reply_err(req, errno); ++ res = ++ fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } + +- fuse_reply_attr(req, &buf, lo->timeout); ++ fuse_reply_attr(req, &buf, lo->timeout); + } + + static int utimensat_empty_nofollow(struct lo_inode *inode, +- const struct timespec *tv) ++ const struct timespec *tv) + { +- int res; +- char procname[64]; +- +- if (inode->is_symlink) { +- res = utimensat(inode->fd, "", tv, +- AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1 && errno == EINVAL) { +- /* Sorry, no race free way to set times on symlink. */ +- errno = EPERM; +- } +- return res; +- } +- sprintf(procname, "/proc/self/fd/%i", inode->fd); +- +- return utimensat(AT_FDCWD, procname, tv, 0); ++ int res; ++ char procname[64]; ++ ++ if (inode->is_symlink) { ++ res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1 && errno == EINVAL) { ++ /* Sorry, no race free way to set times on symlink. */ ++ errno = EPERM; ++ } ++ return res; ++ } ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ return utimensat(AT_FDCWD, procname, tv, 0); + } + + static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, +- int valid, struct fuse_file_info *fi) ++ int valid, struct fuse_file_info *fi) + { +- int saverr; +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- int ifd = inode->fd; +- int res; +- +- if (valid & FUSE_SET_ATTR_MODE) { +- if (fi) { +- res = fchmod(fi->fh, attr->st_mode); +- } else { +- sprintf(procname, "/proc/self/fd/%i", ifd); +- res = chmod(procname, attr->st_mode); +- } +- if (res == -1) +- goto out_err; +- } +- if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { +- uid_t uid = (valid & FUSE_SET_ATTR_UID) ? +- attr->st_uid : (uid_t) -1; +- gid_t gid = (valid & FUSE_SET_ATTR_GID) ? 
+- attr->st_gid : (gid_t) -1; +- +- res = fchownat(ifd, "", uid, gid, +- AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1) +- goto out_err; +- } +- if (valid & FUSE_SET_ATTR_SIZE) { +- if (fi) { +- res = ftruncate(fi->fh, attr->st_size); +- } else { +- sprintf(procname, "/proc/self/fd/%i", ifd); +- res = truncate(procname, attr->st_size); +- } +- if (res == -1) +- goto out_err; +- } +- if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { +- struct timespec tv[2]; +- +- tv[0].tv_sec = 0; +- tv[1].tv_sec = 0; +- tv[0].tv_nsec = UTIME_OMIT; +- tv[1].tv_nsec = UTIME_OMIT; +- +- if (valid & FUSE_SET_ATTR_ATIME_NOW) +- tv[0].tv_nsec = UTIME_NOW; +- else if (valid & FUSE_SET_ATTR_ATIME) +- tv[0] = attr->st_atim; +- +- if (valid & FUSE_SET_ATTR_MTIME_NOW) +- tv[1].tv_nsec = UTIME_NOW; +- else if (valid & FUSE_SET_ATTR_MTIME) +- tv[1] = attr->st_mtim; +- +- if (fi) +- res = futimens(fi->fh, tv); +- else +- res = utimensat_empty_nofollow(inode, tv); +- if (res == -1) +- goto out_err; +- } +- +- return lo_getattr(req, ino, fi); ++ int saverr; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ int ifd = inode->fd; ++ int res; ++ ++ if (valid & FUSE_SET_ATTR_MODE) { ++ if (fi) { ++ res = fchmod(fi->fh, attr->st_mode); ++ } else { ++ sprintf(procname, "/proc/self/fd/%i", ifd); ++ res = chmod(procname, attr->st_mode); ++ } ++ if (res == -1) { ++ goto out_err; ++ } ++ } ++ if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { ++ uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1; ++ gid_t gid = (valid & FUSE_SET_ATTR_GID) ? 
attr->st_gid : (gid_t)-1; ++ ++ res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ goto out_err; ++ } ++ } ++ if (valid & FUSE_SET_ATTR_SIZE) { ++ if (fi) { ++ res = ftruncate(fi->fh, attr->st_size); ++ } else { ++ sprintf(procname, "/proc/self/fd/%i", ifd); ++ res = truncate(procname, attr->st_size); ++ } ++ if (res == -1) { ++ goto out_err; ++ } ++ } ++ if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { ++ struct timespec tv[2]; ++ ++ tv[0].tv_sec = 0; ++ tv[1].tv_sec = 0; ++ tv[0].tv_nsec = UTIME_OMIT; ++ tv[1].tv_nsec = UTIME_OMIT; ++ ++ if (valid & FUSE_SET_ATTR_ATIME_NOW) { ++ tv[0].tv_nsec = UTIME_NOW; ++ } else if (valid & FUSE_SET_ATTR_ATIME) { ++ tv[0] = attr->st_atim; ++ } ++ ++ if (valid & FUSE_SET_ATTR_MTIME_NOW) { ++ tv[1].tv_nsec = UTIME_NOW; ++ } else if (valid & FUSE_SET_ATTR_MTIME) { ++ tv[1] = attr->st_mtim; ++ } ++ ++ if (fi) { ++ res = futimens(fi->fh, tv); ++ } else { ++ res = utimensat_empty_nofollow(inode, tv); ++ } ++ if (res == -1) { ++ goto out_err; ++ } ++ } ++ ++ return lo_getattr(req, ino, fi); + + out_err: +- saverr = errno; +- fuse_reply_err(req, saverr); ++ saverr = errno; ++ fuse_reply_err(req, saverr); + } + + static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + { +- struct lo_inode *p; +- struct lo_inode *ret = NULL; +- +- pthread_mutex_lock(&lo->mutex); +- for (p = lo->root.next; p != &lo->root; p = p->next) { +- if (p->ino == st->st_ino && p->dev == st->st_dev) { +- assert(p->refcount > 0); +- ret = p; +- ret->refcount++; +- break; +- } +- } +- pthread_mutex_unlock(&lo->mutex); +- return ret; ++ struct lo_inode *p; ++ struct lo_inode *ret = NULL; ++ ++ pthread_mutex_lock(&lo->mutex); ++ for (p = lo->root.next; p != &lo->root; p = p->next) { ++ if (p->ino == st->st_ino && p->dev == st->st_dev) { ++ assert(p->refcount > 0); ++ ret = p; ++ ret->refcount++; ++ break; ++ } ++ } ++ pthread_mutex_unlock(&lo->mutex); ++ return ret; + } + + static int 
lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, +- struct fuse_entry_param *e) ++ struct fuse_entry_param *e) + { +- int newfd; +- int res; +- int saverr; +- struct lo_data *lo = lo_data(req); +- struct lo_inode *inode; +- +- memset(e, 0, sizeof(*e)); +- e->attr_timeout = lo->timeout; +- e->entry_timeout = lo->timeout; +- +- newfd = openat(lo_fd(req, parent), name, O_PATH | O_NOFOLLOW); +- if (newfd == -1) +- goto out_err; +- +- res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1) +- goto out_err; +- +- inode = lo_find(lo_data(req), &e->attr); +- if (inode) { +- close(newfd); +- newfd = -1; +- } else { +- struct lo_inode *prev, *next; +- +- saverr = ENOMEM; +- inode = calloc(1, sizeof(struct lo_inode)); +- if (!inode) +- goto out_err; +- +- inode->is_symlink = S_ISLNK(e->attr.st_mode); +- inode->refcount = 1; +- inode->fd = newfd; +- inode->ino = e->attr.st_ino; +- inode->dev = e->attr.st_dev; +- +- pthread_mutex_lock(&lo->mutex); +- prev = &lo->root; +- next = prev->next; +- next->prev = inode; +- inode->next = next; +- inode->prev = prev; +- prev->next = inode; +- pthread_mutex_unlock(&lo->mutex); +- } +- e->ino = (uintptr_t) inode; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long) parent, name, (unsigned long long) e->ino); +- +- return 0; ++ int newfd; ++ int res; ++ int saverr; ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode; ++ ++ memset(e, 0, sizeof(*e)); ++ e->attr_timeout = lo->timeout; ++ e->entry_timeout = lo->timeout; ++ ++ newfd = openat(lo_fd(req, parent), name, O_PATH | O_NOFOLLOW); ++ if (newfd == -1) { ++ goto out_err; ++ } ++ ++ res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ goto out_err; ++ } ++ ++ inode = lo_find(lo_data(req), &e->attr); ++ if (inode) { ++ close(newfd); ++ newfd = -1; ++ } else { ++ struct lo_inode *prev, *next; ++ ++ saverr = ENOMEM; ++ inode = calloc(1, 
sizeof(struct lo_inode)); ++ if (!inode) { ++ goto out_err; ++ } ++ ++ inode->is_symlink = S_ISLNK(e->attr.st_mode); ++ inode->refcount = 1; ++ inode->fd = newfd; ++ inode->ino = e->attr.st_ino; ++ inode->dev = e->attr.st_dev; ++ ++ pthread_mutex_lock(&lo->mutex); ++ prev = &lo->root; ++ next = prev->next; ++ next->prev = inode; ++ inode->next = next; ++ inode->prev = prev; ++ prev->next = inode; ++ pthread_mutex_unlock(&lo->mutex); ++ } ++ e->ino = (uintptr_t)inode; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long)parent, name, (unsigned long long)e->ino); ++ } ++ ++ return 0; + + out_err: +- saverr = errno; +- if (newfd != -1) +- close(newfd); +- return saverr; ++ saverr = errno; ++ if (newfd != -1) { ++ close(newfd); ++ } ++ return saverr; + } + + static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + { +- struct fuse_entry_param e; +- int err; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", +- parent, name); +- +- err = lo_do_lookup(req, parent, name, &e); +- if (err) +- fuse_reply_err(req, err); +- else +- fuse_reply_entry(req, &e); ++ struct fuse_entry_param e; ++ int err; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", ++ parent, name); ++ } ++ ++ err = lo_do_lookup(req, parent, name, &e); ++ if (err) { ++ fuse_reply_err(req, err); ++ } else { ++ fuse_reply_entry(req, &e); ++ } + } + + static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, +- const char *name, mode_t mode, dev_t rdev, +- const char *link) ++ const char *name, mode_t mode, dev_t rdev, ++ const char *link) + { +- int res; +- int saverr; +- struct lo_inode *dir = lo_inode(req, parent); +- struct fuse_entry_param e; ++ int res; ++ int saverr; ++ struct lo_inode *dir = lo_inode(req, parent); ++ struct fuse_entry_param e; + +- saverr = ENOMEM; ++ saverr = ENOMEM; + +- res = mknod_wrapper(dir->fd, name, link, mode, 
rdev); ++ res = mknod_wrapper(dir->fd, name, link, mode, rdev); + +- saverr = errno; +- if (res == -1) +- goto out; ++ saverr = errno; ++ if (res == -1) { ++ goto out; ++ } + +- saverr = lo_do_lookup(req, parent, name, &e); +- if (saverr) +- goto out; ++ saverr = lo_do_lookup(req, parent, name, &e); ++ if (saverr) { ++ goto out; ++ } + +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long) parent, name, (unsigned long long) e.ino); ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long)parent, name, (unsigned long long)e.ino); ++ } + +- fuse_reply_entry(req, &e); +- return; ++ fuse_reply_entry(req, &e); ++ return; + + out: +- fuse_reply_err(req, saverr); ++ fuse_reply_err(req, saverr); + } + +-static void lo_mknod(fuse_req_t req, fuse_ino_t parent, +- const char *name, mode_t mode, dev_t rdev) ++static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, dev_t rdev) + { +- lo_mknod_symlink(req, parent, name, mode, rdev, NULL); ++ lo_mknod_symlink(req, parent, name, mode, rdev, NULL); + } + + static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode) ++ mode_t mode) + { +- lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); ++ lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); + } + +-static void lo_symlink(fuse_req_t req, const char *link, +- fuse_ino_t parent, const char *name) ++static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent, ++ const char *name) + { +- lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); ++ lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); + } + + static int linkat_empty_nofollow(struct lo_inode *inode, int dfd, +- const char *name) ++ const char *name) + { +- int res; +- char procname[64]; ++ int res; ++ char procname[64]; + +- if (inode->is_symlink) { +- res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); +- if (res == -1 && (errno == ENOENT || 
errno == EINVAL)) { +- /* Sorry, no race free way to hard-link a symlink. */ +- errno = EPERM; +- } +- return res; +- } ++ if (inode->is_symlink) { ++ res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); ++ if (res == -1 && (errno == ENOENT || errno == EINVAL)) { ++ /* Sorry, no race free way to hard-link a symlink. */ ++ errno = EPERM; ++ } ++ return res; ++ } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); + +- return linkat(AT_FDCWD, procname, dfd, name, AT_SYMLINK_FOLLOW); ++ return linkat(AT_FDCWD, procname, dfd, name, AT_SYMLINK_FOLLOW); + } + + static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, +- const char *name) ++ const char *name) + { +- int res; +- struct lo_data *lo = lo_data(req); +- struct lo_inode *inode = lo_inode(req, ino); +- struct fuse_entry_param e; +- int saverr; +- +- memset(&e, 0, sizeof(struct fuse_entry_param)); +- e.attr_timeout = lo->timeout; +- e.entry_timeout = lo->timeout; +- +- res = linkat_empty_nofollow(inode, lo_fd(req, parent), name); +- if (res == -1) +- goto out_err; +- +- res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1) +- goto out_err; +- +- pthread_mutex_lock(&lo->mutex); +- inode->refcount++; +- pthread_mutex_unlock(&lo->mutex); +- e.ino = (uintptr_t) inode; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long) parent, name, +- (unsigned long long) e.ino); +- +- fuse_reply_entry(req, &e); +- return; ++ int res; ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); ++ struct fuse_entry_param e; ++ int saverr; ++ ++ memset(&e, 0, sizeof(struct fuse_entry_param)); ++ e.attr_timeout = lo->timeout; ++ e.entry_timeout = lo->timeout; ++ ++ res = linkat_empty_nofollow(inode, lo_fd(req, parent), name); ++ if (res == -1) { ++ goto out_err; ++ } ++ ++ res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if 
(res == -1) { ++ goto out_err; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ inode->refcount++; ++ pthread_mutex_unlock(&lo->mutex); ++ e.ino = (uintptr_t)inode; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long)parent, name, (unsigned long long)e.ino); ++ } ++ ++ fuse_reply_entry(req, &e); ++ return; + + out_err: +- saverr = errno; +- fuse_reply_err(req, saverr); ++ saverr = errno; ++ fuse_reply_err(req, saverr); + } + + static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) + { +- int res; ++ int res; + +- res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); ++ res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); + +- fuse_reply_err(req, res == -1 ? errno : 0); ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + + static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, +- fuse_ino_t newparent, const char *newname, +- unsigned int flags) ++ fuse_ino_t newparent, const char *newname, ++ unsigned int flags) + { +- int res; ++ int res; + +- if (flags) { +- fuse_reply_err(req, EINVAL); +- return; +- } ++ if (flags) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + +- res = renameat(lo_fd(req, parent), name, +- lo_fd(req, newparent), newname); ++ res = renameat(lo_fd(req, parent), name, lo_fd(req, newparent), newname); + +- fuse_reply_err(req, res == -1 ? errno : 0); ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + + static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + { +- int res; ++ int res; + +- res = unlinkat(lo_fd(req, parent), name, 0); ++ res = unlinkat(lo_fd(req, parent), name, 0); + +- fuse_reply_err(req, res == -1 ? errno : 0); ++ fuse_reply_err(req, res == -1 ? 
errno : 0); + } + + static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) + { +- if (!inode) +- return; +- +- pthread_mutex_lock(&lo->mutex); +- assert(inode->refcount >= n); +- inode->refcount -= n; +- if (!inode->refcount) { +- struct lo_inode *prev, *next; +- +- prev = inode->prev; +- next = inode->next; +- next->prev = prev; +- prev->next = next; +- +- pthread_mutex_unlock(&lo->mutex); +- close(inode->fd); +- free(inode); +- +- } else { +- pthread_mutex_unlock(&lo->mutex); +- } ++ if (!inode) { ++ return; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ assert(inode->refcount >= n); ++ inode->refcount -= n; ++ if (!inode->refcount) { ++ struct lo_inode *prev, *next; ++ ++ prev = inode->prev; ++ next = inode->next; ++ next->prev = prev; ++ prev->next = next; ++ ++ pthread_mutex_unlock(&lo->mutex); ++ close(inode->fd); ++ free(inode); ++ ++ } else { ++ pthread_mutex_unlock(&lo->mutex); ++ } + } + + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { +- struct lo_data *lo = lo_data(req); +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", +- (unsigned long long) ino, +- (unsigned long long) inode->refcount, +- (unsigned long long) nlookup); +- } ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", ++ (unsigned long long)ino, (unsigned long long)inode->refcount, ++ (unsigned long long)nlookup); ++ } + +- unref_inode(lo, inode, nlookup); ++ unref_inode(lo, inode, nlookup); + } + + static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { +- lo_forget_one(req, ino, nlookup); +- fuse_reply_none(req); ++ lo_forget_one(req, ino, nlookup); ++ fuse_reply_none(req); + } + + static void lo_forget_multi(fuse_req_t req, size_t count, +- struct fuse_forget_data *forgets) ++ struct fuse_forget_data *forgets) + { +- int 
i; ++ int i; + +- for (i = 0; i < count; i++) +- lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); +- fuse_reply_none(req); ++ for (i = 0; i < count; i++) { ++ lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); ++ } ++ fuse_reply_none(req); + } + + static void lo_readlink(fuse_req_t req, fuse_ino_t ino) + { +- char buf[PATH_MAX + 1]; +- int res; ++ char buf[PATH_MAX + 1]; ++ int res; + +- res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); +- if (res == -1) +- return (void) fuse_reply_err(req, errno); ++ res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); ++ if (res == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } + +- if (res == sizeof(buf)) +- return (void) fuse_reply_err(req, ENAMETOOLONG); ++ if (res == sizeof(buf)) { ++ return (void)fuse_reply_err(req, ENAMETOOLONG); ++ } + +- buf[res] = '\0'; ++ buf[res] = '\0'; + +- fuse_reply_readlink(req, buf); ++ fuse_reply_readlink(req, buf); + } + + struct lo_dirp { +- DIR *dp; +- struct dirent *entry; +- off_t offset; ++ DIR *dp; ++ struct dirent *entry; ++ off_t offset; + }; + + static struct lo_dirp *lo_dirp(struct fuse_file_info *fi) + { +- return (struct lo_dirp *) (uintptr_t) fi->fh; ++ return (struct lo_dirp *)(uintptr_t)fi->fh; + } + +-static void lo_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++static void lo_opendir(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi) + { +- int error = ENOMEM; +- struct lo_data *lo = lo_data(req); +- struct lo_dirp *d; +- int fd; +- +- d = calloc(1, sizeof(struct lo_dirp)); +- if (d == NULL) +- goto out_err; +- +- fd = openat(lo_fd(req, ino), ".", O_RDONLY); +- if (fd == -1) +- goto out_errno; +- +- d->dp = fdopendir(fd); +- if (d->dp == NULL) +- goto out_errno; +- +- d->offset = 0; +- d->entry = NULL; +- +- fi->fh = (uintptr_t) d; +- if (lo->cache == CACHE_ALWAYS) +- fi->keep_cache = 1; +- fuse_reply_open(req, fi); +- return; ++ int error = ENOMEM; ++ struct lo_data *lo = lo_data(req); ++ struct lo_dirp *d; ++ int 
fd; ++ ++ d = calloc(1, sizeof(struct lo_dirp)); ++ if (d == NULL) { ++ goto out_err; ++ } ++ ++ fd = openat(lo_fd(req, ino), ".", O_RDONLY); ++ if (fd == -1) { ++ goto out_errno; ++ } ++ ++ d->dp = fdopendir(fd); ++ if (d->dp == NULL) { ++ goto out_errno; ++ } ++ ++ d->offset = 0; ++ d->entry = NULL; ++ ++ fi->fh = (uintptr_t)d; ++ if (lo->cache == CACHE_ALWAYS) { ++ fi->keep_cache = 1; ++ } ++ fuse_reply_open(req, fi); ++ return; + + out_errno: +- error = errno; ++ error = errno; + out_err: +- if (d) { +- if (fd != -1) +- close(fd); +- free(d); +- } +- fuse_reply_err(req, error); ++ if (d) { ++ if (fd != -1) { ++ close(fd); ++ } ++ free(d); ++ } ++ fuse_reply_err(req, error); + } + + static int is_dot_or_dotdot(const char *name) + { +- return name[0] == '.' && (name[1] == '\0' || +- (name[1] == '.' && name[2] == '\0')); ++ return name[0] == '.' && ++ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')); + } + + static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, +- off_t offset, struct fuse_file_info *fi, int plus) ++ off_t offset, struct fuse_file_info *fi, int plus) + { +- struct lo_dirp *d = lo_dirp(fi); +- char *buf; +- char *p; +- size_t rem = size; +- int err; +- +- (void) ino; +- +- buf = calloc(1, size); +- if (!buf) { +- err = ENOMEM; +- goto error; +- } +- p = buf; +- +- if (offset != d->offset) { +- seekdir(d->dp, offset); +- d->entry = NULL; +- d->offset = offset; +- } +- while (1) { +- size_t entsize; +- off_t nextoff; +- const char *name; +- +- if (!d->entry) { +- errno = 0; +- d->entry = readdir(d->dp); +- if (!d->entry) { +- if (errno) { // Error +- err = errno; +- goto error; +- } else { // End of stream +- break; +- } +- } +- } +- nextoff = d->entry->d_off; +- name = d->entry->d_name; +- fuse_ino_t entry_ino = 0; +- if (plus) { +- struct fuse_entry_param e; +- if (is_dot_or_dotdot(name)) { +- e = (struct fuse_entry_param) { +- .attr.st_ino = d->entry->d_ino, +- .attr.st_mode = d->entry->d_type << 12, +- }; +- } else { +- 
err = lo_do_lookup(req, ino, name, &e); +- if (err) +- goto error; +- entry_ino = e.ino; +- } +- +- entsize = fuse_add_direntry_plus(req, p, rem, name, +- &e, nextoff); +- } else { +- struct stat st = { +- .st_ino = d->entry->d_ino, +- .st_mode = d->entry->d_type << 12, +- }; +- entsize = fuse_add_direntry(req, p, rem, name, +- &st, nextoff); +- } +- if (entsize > rem) { +- if (entry_ino != 0) +- lo_forget_one(req, entry_ino, 1); +- break; +- } +- +- p += entsize; +- rem -= entsize; +- +- d->entry = NULL; +- d->offset = nextoff; +- } ++ struct lo_dirp *d = lo_dirp(fi); ++ char *buf; ++ char *p; ++ size_t rem = size; ++ int err; ++ ++ (void)ino; ++ ++ buf = calloc(1, size); ++ if (!buf) { ++ err = ENOMEM; ++ goto error; ++ } ++ p = buf; ++ ++ if (offset != d->offset) { ++ seekdir(d->dp, offset); ++ d->entry = NULL; ++ d->offset = offset; ++ } ++ while (1) { ++ size_t entsize; ++ off_t nextoff; ++ const char *name; ++ ++ if (!d->entry) { ++ errno = 0; ++ d->entry = readdir(d->dp); ++ if (!d->entry) { ++ if (errno) { /* Error */ ++ err = errno; ++ goto error; ++ } else { /* End of stream */ ++ break; ++ } ++ } ++ } ++ nextoff = d->entry->d_off; ++ name = d->entry->d_name; ++ fuse_ino_t entry_ino = 0; ++ if (plus) { ++ struct fuse_entry_param e; ++ if (is_dot_or_dotdot(name)) { ++ e = (struct fuse_entry_param){ ++ .attr.st_ino = d->entry->d_ino, ++ .attr.st_mode = d->entry->d_type << 12, ++ }; ++ } else { ++ err = lo_do_lookup(req, ino, name, &e); ++ if (err) { ++ goto error; ++ } ++ entry_ino = e.ino; ++ } ++ ++ entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff); ++ } else { ++ struct stat st = { ++ .st_ino = d->entry->d_ino, ++ .st_mode = d->entry->d_type << 12, ++ }; ++ entsize = fuse_add_direntry(req, p, rem, name, &st, nextoff); ++ } ++ if (entsize > rem) { ++ if (entry_ino != 0) { ++ lo_forget_one(req, entry_ino, 1); ++ } ++ break; ++ } ++ ++ p += entsize; ++ rem -= entsize; ++ ++ d->entry = NULL; ++ d->offset = nextoff; ++ } + + err = 0; + error: 
+- // If there's an error, we can only signal it if we haven't stored +- // any entries yet - otherwise we'd end up with wrong lookup +- // counts for the entries that are already in the buffer. So we +- // return what we've collected until that point. +- if (err && rem == size) +- fuse_reply_err(req, err); +- else +- fuse_reply_buf(req, buf, size - rem); ++ /* ++ * If there's an error, we can only signal it if we haven't stored ++ * any entries yet - otherwise we'd end up with wrong lookup ++ * counts for the entries that are already in the buffer. So we ++ * return what we've collected until that point. ++ */ ++ if (err && rem == size) { ++ fuse_reply_err(req, err); ++ } else { ++ fuse_reply_buf(req, buf, size - rem); ++ } + free(buf); + } + + static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, +- off_t offset, struct fuse_file_info *fi) ++ off_t offset, struct fuse_file_info *fi) + { +- lo_do_readdir(req, ino, size, offset, fi, 0); ++ lo_do_readdir(req, ino, size, offset, fi, 0); + } + + static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, +- off_t offset, struct fuse_file_info *fi) ++ off_t offset, struct fuse_file_info *fi) + { +- lo_do_readdir(req, ino, size, offset, fi, 1); ++ lo_do_readdir(req, ino, size, offset, fi, 1); + } + +-static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi) + { +- struct lo_dirp *d = lo_dirp(fi); +- (void) ino; +- closedir(d->dp); +- free(d); +- fuse_reply_err(req, 0); ++ struct lo_dirp *d = lo_dirp(fi); ++ (void)ino; ++ closedir(d->dp); ++ free(d); ++ fuse_reply_err(req, 0); + } + + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode, struct fuse_file_info *fi) ++ mode_t mode, struct fuse_file_info *fi) + { +- int fd; +- struct lo_data *lo = lo_data(req); +- struct fuse_entry_param e; +- int err; +- +- if (lo_debug(req)) +- 
fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", +- parent, name); +- +- fd = openat(lo_fd(req, parent), name, +- (fi->flags | O_CREAT) & ~O_NOFOLLOW, mode); +- if (fd == -1) +- return (void) fuse_reply_err(req, errno); +- +- fi->fh = fd; +- if (lo->cache == CACHE_NEVER) +- fi->direct_io = 1; +- else if (lo->cache == CACHE_ALWAYS) +- fi->keep_cache = 1; +- +- err = lo_do_lookup(req, parent, name, &e); +- if (err) +- fuse_reply_err(req, err); +- else +- fuse_reply_create(req, &e, fi); ++ int fd; ++ struct lo_data *lo = lo_data(req); ++ struct fuse_entry_param e; ++ int err; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", ++ parent, name); ++ } ++ ++ fd = openat(lo_fd(req, parent), name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, ++ mode); ++ if (fd == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } ++ ++ fi->fh = fd; ++ if (lo->cache == CACHE_NEVER) { ++ fi->direct_io = 1; ++ } else if (lo->cache == CACHE_ALWAYS) { ++ fi->keep_cache = 1; ++ } ++ ++ err = lo_do_lookup(req, parent, name, &e); ++ if (err) { ++ fuse_reply_err(req, err); ++ } else { ++ fuse_reply_create(req, &e, fi); ++ } + } + + static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, +- struct fuse_file_info *fi) ++ struct fuse_file_info *fi) + { +- int res; +- int fd = dirfd(lo_dirp(fi)->dp); +- (void) ino; +- if (datasync) +- res = fdatasync(fd); +- else +- res = fsync(fd); +- fuse_reply_err(req, res == -1 ? errno : 0); ++ int res; ++ int fd = dirfd(lo_dirp(fi)->dp); ++ (void)ino; ++ if (datasync) { ++ res = fdatasync(fd); ++ } else { ++ res = fsync(fd); ++ } ++ fuse_reply_err(req, res == -1 ? 
errno : 0); + } + + static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { +- int fd; +- char buf[64]; +- struct lo_data *lo = lo_data(req); +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", +- ino, fi->flags); +- +- /* With writeback cache, kernel may send read requests even +- when userspace opened write-only */ +- if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { +- fi->flags &= ~O_ACCMODE; +- fi->flags |= O_RDWR; +- } +- +- /* With writeback cache, O_APPEND is handled by the kernel. +- This breaks atomicity (since the file may change in the +- underlying filesystem, so that the kernel's idea of the +- end of the file isn't accurate anymore). In this example, +- we just accept that. A more rigorous filesystem may want +- to return an error here */ +- if (lo->writeback && (fi->flags & O_APPEND)) +- fi->flags &= ~O_APPEND; +- +- sprintf(buf, "/proc/self/fd/%i", lo_fd(req, ino)); +- fd = open(buf, fi->flags & ~O_NOFOLLOW); +- if (fd == -1) +- return (void) fuse_reply_err(req, errno); +- +- fi->fh = fd; +- if (lo->cache == CACHE_NEVER) +- fi->direct_io = 1; +- else if (lo->cache == CACHE_ALWAYS) +- fi->keep_cache = 1; +- fuse_reply_open(req, fi); ++ int fd; ++ char buf[64]; ++ struct lo_data *lo = lo_data(req); ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, ++ fi->flags); ++ } ++ ++ /* ++ * With writeback cache, kernel may send read requests even ++ * when userspace opened write-only ++ */ ++ if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { ++ fi->flags &= ~O_ACCMODE; ++ fi->flags |= O_RDWR; ++ } ++ ++ /* ++ * With writeback cache, O_APPEND is handled by the kernel. ++ * This breaks atomicity (since the file may change in the ++ * underlying filesystem, so that the kernel's idea of the ++ * end of the file isn't accurate anymore). In this example, ++ * we just accept that. 
A more rigorous filesystem may want ++ * to return an error here ++ */ ++ if (lo->writeback && (fi->flags & O_APPEND)) { ++ fi->flags &= ~O_APPEND; ++ } ++ ++ sprintf(buf, "/proc/self/fd/%i", lo_fd(req, ino)); ++ fd = open(buf, fi->flags & ~O_NOFOLLOW); ++ if (fd == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } ++ ++ fi->fh = fd; ++ if (lo->cache == CACHE_NEVER) { ++ fi->direct_io = 1; ++ } else if (lo->cache == CACHE_ALWAYS) { ++ fi->keep_cache = 1; ++ } ++ fuse_reply_open(req, fi); + } + +-static void lo_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++static void lo_release(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi) + { +- (void) ino; ++ (void)ino; + +- close(fi->fh); +- fuse_reply_err(req, 0); ++ close(fi->fh); ++ fuse_reply_err(req, 0); + } + + static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { +- int res; +- (void) ino; +- res = close(dup(fi->fh)); +- fuse_reply_err(req, res == -1 ? errno : 0); ++ int res; ++ (void)ino; ++ res = close(dup(fi->fh)); ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + + static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, +- struct fuse_file_info *fi) ++ struct fuse_file_info *fi) + { +- int res; +- (void) ino; +- if (datasync) +- res = fdatasync(fi->fh); +- else +- res = fsync(fi->fh); +- fuse_reply_err(req, res == -1 ? errno : 0); ++ int res; ++ (void)ino; ++ if (datasync) { ++ res = fdatasync(fi->fh); ++ } else { ++ res = fsync(fi->fh); ++ } ++ fuse_reply_err(req, res == -1 ? 
errno : 0); + } + +-static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, +- off_t offset, struct fuse_file_info *fi) ++static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, ++ struct fuse_file_info *fi) + { +- struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); ++ struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); + +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_read(ino=%" PRIu64 ", size=%zd, " +- "off=%lu)\n", ino, size, (unsigned long) offset); ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_read(ino=%" PRIu64 ", size=%zd, " ++ "off=%lu)\n", ++ ino, size, (unsigned long)offset); ++ } + +- buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; +- buf.buf[0].fd = fi->fh; +- buf.buf[0].pos = offset; ++ buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; ++ buf.buf[0].fd = fi->fh; ++ buf.buf[0].pos = offset; + +- fuse_reply_data(req, &buf, FUSE_BUF_SPLICE_MOVE); ++ fuse_reply_data(req, &buf, FUSE_BUF_SPLICE_MOVE); + } + + static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, +- struct fuse_bufvec *in_buf, off_t off, +- struct fuse_file_info *fi) ++ struct fuse_bufvec *in_buf, off_t off, ++ struct fuse_file_info *fi) + { +- (void) ino; +- ssize_t res; +- struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); +- +- out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; +- out_buf.buf[0].fd = fi->fh; +- out_buf.buf[0].pos = off; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_write(ino=%" PRIu64 ", size=%zd, off=%lu)\n", +- ino, out_buf.buf[0].size, (unsigned long) off); +- +- res = fuse_buf_copy(&out_buf, in_buf, 0); +- if(res < 0) +- fuse_reply_err(req, -res); +- else +- fuse_reply_write(req, (size_t) res); ++ (void)ino; ++ ssize_t res; ++ struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); ++ ++ out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; ++ out_buf.buf[0].fd = fi->fh; ++ out_buf.buf[0].pos = off; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, ++ 
"lo_write(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, ++ out_buf.buf[0].size, (unsigned long)off); ++ } ++ ++ res = fuse_buf_copy(&out_buf, in_buf, 0); ++ if (res < 0) { ++ fuse_reply_err(req, -res); ++ } else { ++ fuse_reply_write(req, (size_t)res); ++ } + } + + static void lo_statfs(fuse_req_t req, fuse_ino_t ino) + { +- int res; +- struct statvfs stbuf; +- +- res = fstatvfs(lo_fd(req, ino), &stbuf); +- if (res == -1) +- fuse_reply_err(req, errno); +- else +- fuse_reply_statfs(req, &stbuf); ++ int res; ++ struct statvfs stbuf; ++ ++ res = fstatvfs(lo_fd(req, ino), &stbuf); ++ if (res == -1) { ++ fuse_reply_err(req, errno); ++ } else { ++ fuse_reply_statfs(req, &stbuf); ++ } + } + +-static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, +- off_t offset, off_t length, struct fuse_file_info *fi) ++static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, ++ off_t length, struct fuse_file_info *fi) + { +- int err = EOPNOTSUPP; +- (void) ino; ++ int err = EOPNOTSUPP; ++ (void)ino; + + #ifdef HAVE_FALLOCATE +- err = fallocate(fi->fh, mode, offset, length); +- if (err < 0) +- err = errno; ++ err = fallocate(fi->fh, mode, offset, length); ++ if (err < 0) { ++ err = errno; ++ } + + #elif defined(HAVE_POSIX_FALLOCATE) +- if (mode) { +- fuse_reply_err(req, EOPNOTSUPP); +- return; +- } ++ if (mode) { ++ fuse_reply_err(req, EOPNOTSUPP); ++ return; ++ } + +- err = posix_fallocate(fi->fh, offset, length); ++ err = posix_fallocate(fi->fh, offset, length); + #endif + +- fuse_reply_err(req, err); ++ fuse_reply_err(req, err); + } + + static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, +- int op) ++ int op) + { +- int res; +- (void) ino; ++ int res; ++ (void)ino; + +- res = flock(fi->fh, op); ++ res = flock(fi->fh, op); + +- fuse_reply_err(req, res == -1 ? errno : 0); ++ fuse_reply_err(req, res == -1 ? 
errno : 0); + } + + static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, +- size_t size) ++ size_t size) + { +- char *value = NULL; +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- ssize_t ret; +- int saverr; +- +- saverr = ENOSYS; +- if (!lo_data(req)->xattr) +- goto out; +- +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", +- ino, name, size); +- } +- +- if (inode->is_symlink) { +- /* Sorry, no race free way to getxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- +- sprintf(procname, "/proc/self/fd/%i", inode->fd); +- +- if (size) { +- value = malloc(size); +- if (!value) +- goto out_err; +- +- ret = getxattr(procname, name, value, size); +- if (ret == -1) +- goto out_err; +- saverr = 0; +- if (ret == 0) +- goto out; +- +- fuse_reply_buf(req, value, ret); +- } else { +- ret = getxattr(procname, name, NULL, 0); +- if (ret == -1) +- goto out_err; +- +- fuse_reply_xattr(req, ret); +- } ++ char *value = NULL; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) { ++ goto out; ++ } ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", ino, name, ++ size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to getxattr on symlink. 
*/ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ if (size) { ++ value = malloc(size); ++ if (!value) { ++ goto out_err; ++ } ++ ++ ret = getxattr(procname, name, value, size); ++ if (ret == -1) { ++ goto out_err; ++ } ++ saverr = 0; ++ if (ret == 0) { ++ goto out; ++ } ++ ++ fuse_reply_buf(req, value, ret); ++ } else { ++ ret = getxattr(procname, name, NULL, 0); ++ if (ret == -1) { ++ goto out_err; ++ } ++ ++ fuse_reply_xattr(req, ret); ++ } + out_free: +- free(value); +- return; ++ free(value); ++ return; + + out_err: +- saverr = errno; ++ saverr = errno; + out: +- fuse_reply_err(req, saverr); +- goto out_free; ++ fuse_reply_err(req, saverr); ++ goto out_free; + } + + static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + { +- char *value = NULL; +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- ssize_t ret; +- int saverr; +- +- saverr = ENOSYS; +- if (!lo_data(req)->xattr) +- goto out; +- +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", +- ino, size); +- } +- +- if (inode->is_symlink) { +- /* Sorry, no race free way to listxattr on symlink. 
*/ +- saverr = EPERM; +- goto out; +- } +- +- sprintf(procname, "/proc/self/fd/%i", inode->fd); +- +- if (size) { +- value = malloc(size); +- if (!value) +- goto out_err; +- +- ret = listxattr(procname, value, size); +- if (ret == -1) +- goto out_err; +- saverr = 0; +- if (ret == 0) +- goto out; +- +- fuse_reply_buf(req, value, ret); +- } else { +- ret = listxattr(procname, NULL, 0); +- if (ret == -1) +- goto out_err; +- +- fuse_reply_xattr(req, ret); +- } ++ char *value = NULL; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) { ++ goto out; ++ } ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ++ ino, size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to listxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ if (size) { ++ value = malloc(size); ++ if (!value) { ++ goto out_err; ++ } ++ ++ ret = listxattr(procname, value, size); ++ if (ret == -1) { ++ goto out_err; ++ } ++ saverr = 0; ++ if (ret == 0) { ++ goto out; ++ } ++ ++ fuse_reply_buf(req, value, ret); ++ } else { ++ ret = listxattr(procname, NULL, 0); ++ if (ret == -1) { ++ goto out_err; ++ } ++ ++ fuse_reply_xattr(req, ret); ++ } + out_free: +- free(value); +- return; ++ free(value); ++ return; + + out_err: +- saverr = errno; ++ saverr = errno; + out: +- fuse_reply_err(req, saverr); +- goto out_free; ++ fuse_reply_err(req, saverr); ++ goto out_free; + } + + static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, +- const char *value, size_t size, int flags) ++ const char *value, size_t size, int flags) + { +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- ssize_t ret; +- int saverr; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; + +- saverr = ENOSYS; +- if 
(!lo_data(req)->xattr) +- goto out; ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) { ++ goto out; ++ } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 ", name=%s value=%s size=%zd)\n", +- ino, name, value, size); +- } ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_setxattr(ino=%" PRIu64 ", name=%s value=%s size=%zd)\n", ++ ino, name, value, size); ++ } + +- if (inode->is_symlink) { +- /* Sorry, no race free way to setxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to setxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); + +- ret = setxattr(procname, name, value, size, flags); +- saverr = ret == -1 ? errno : 0; ++ ret = setxattr(procname, name, value, size, flags); ++ saverr = ret == -1 ? errno : 0; + + out: +- fuse_reply_err(req, saverr); ++ fuse_reply_err(req, saverr); + } + + static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + { +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- ssize_t ret; +- int saverr; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; + +- saverr = ENOSYS; +- if (!lo_data(req)->xattr) +- goto out; ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) { ++ goto out; ++ } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", +- ino, name); +- } ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ++ ino, name); ++ } + +- if (inode->is_symlink) { +- /* Sorry, no race free way to setxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to setxattr on symlink. 
*/ ++ saverr = EPERM; ++ goto out; ++ } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); + +- ret = removexattr(procname, name); +- saverr = ret == -1 ? errno : 0; ++ ret = removexattr(procname, name); ++ saverr = ret == -1 ? errno : 0; + + out: +- fuse_reply_err(req, saverr); ++ fuse_reply_err(req, saverr); + } + + #ifdef HAVE_COPY_FILE_RANGE + static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, +- struct fuse_file_info *fi_in, +- fuse_ino_t ino_out, off_t off_out, +- struct fuse_file_info *fi_out, size_t len, +- int flags) ++ struct fuse_file_info *fi_in, fuse_ino_t ino_out, ++ off_t off_out, struct fuse_file_info *fi_out, ++ size_t len, int flags) + { +- ssize_t res; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_copy_file_range(ino=%" PRIu64 "/fd=%lu, " +- "off=%lu, ino=%" PRIu64 "/fd=%lu, " +- "off=%lu, size=%zd, flags=0x%x)\n", +- ino_in, fi_in->fh, off_in, ino_out, fi_out->fh, off_out, +- len, flags); +- +- res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out, len, +- flags); +- if (res < 0) +- fuse_reply_err(req, -errno); +- else +- fuse_reply_write(req, res); ++ ssize_t res; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_copy_file_range(ino=%" PRIu64 "/fd=%lu, " ++ "off=%lu, ino=%" PRIu64 "/fd=%lu, " ++ "off=%lu, size=%zd, flags=0x%x)\n", ++ ino_in, fi_in->fh, off_in, ino_out, fi_out->fh, off_out, len, ++ flags); ++ ++ res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out, len, flags); ++ if (res < 0) { ++ fuse_reply_err(req, -errno); ++ } else { ++ fuse_reply_write(req, res); ++ } + } + #endif + + static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, +- struct fuse_file_info *fi) ++ struct fuse_file_info *fi) + { +- off_t res; +- +- (void)ino; +- res = lseek(fi->fh, off, whence); +- if (res != -1) +- fuse_reply_lseek(req, res); +- else +- fuse_reply_err(req, errno); ++ off_t res; ++ ++ (void)ino; ++ res = 
lseek(fi->fh, off, whence); ++ if (res != -1) { ++ fuse_reply_lseek(req, res); ++ } else { ++ fuse_reply_err(req, errno); ++ } + } + + static struct fuse_lowlevel_ops lo_oper = { +- .init = lo_init, +- .lookup = lo_lookup, +- .mkdir = lo_mkdir, +- .mknod = lo_mknod, +- .symlink = lo_symlink, +- .link = lo_link, +- .unlink = lo_unlink, +- .rmdir = lo_rmdir, +- .rename = lo_rename, +- .forget = lo_forget, +- .forget_multi = lo_forget_multi, +- .getattr = lo_getattr, +- .setattr = lo_setattr, +- .readlink = lo_readlink, +- .opendir = lo_opendir, +- .readdir = lo_readdir, +- .readdirplus = lo_readdirplus, +- .releasedir = lo_releasedir, +- .fsyncdir = lo_fsyncdir, +- .create = lo_create, +- .open = lo_open, +- .release = lo_release, +- .flush = lo_flush, +- .fsync = lo_fsync, +- .read = lo_read, +- .write_buf = lo_write_buf, +- .statfs = lo_statfs, +- .fallocate = lo_fallocate, +- .flock = lo_flock, +- .getxattr = lo_getxattr, +- .listxattr = lo_listxattr, +- .setxattr = lo_setxattr, +- .removexattr = lo_removexattr, ++ .init = lo_init, ++ .lookup = lo_lookup, ++ .mkdir = lo_mkdir, ++ .mknod = lo_mknod, ++ .symlink = lo_symlink, ++ .link = lo_link, ++ .unlink = lo_unlink, ++ .rmdir = lo_rmdir, ++ .rename = lo_rename, ++ .forget = lo_forget, ++ .forget_multi = lo_forget_multi, ++ .getattr = lo_getattr, ++ .setattr = lo_setattr, ++ .readlink = lo_readlink, ++ .opendir = lo_opendir, ++ .readdir = lo_readdir, ++ .readdirplus = lo_readdirplus, ++ .releasedir = lo_releasedir, ++ .fsyncdir = lo_fsyncdir, ++ .create = lo_create, ++ .open = lo_open, ++ .release = lo_release, ++ .flush = lo_flush, ++ .fsync = lo_fsync, ++ .read = lo_read, ++ .write_buf = lo_write_buf, ++ .statfs = lo_statfs, ++ .fallocate = lo_fallocate, ++ .flock = lo_flock, ++ .getxattr = lo_getxattr, ++ .listxattr = lo_listxattr, ++ .setxattr = lo_setxattr, ++ .removexattr = lo_removexattr, + #ifdef HAVE_COPY_FILE_RANGE +- .copy_file_range = lo_copy_file_range, ++ .copy_file_range = lo_copy_file_range, + 
#endif +- .lseek = lo_lseek, ++ .lseek = lo_lseek, + }; + + int main(int argc, char *argv[]) + { +- struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +- struct fuse_session *se; +- struct fuse_cmdline_opts opts; +- struct lo_data lo = { .debug = 0, +- .writeback = 0 }; +- int ret = -1; +- +- /* Don't mask creation mode, kernel already did that */ +- umask(0); +- +- pthread_mutex_init(&lo.mutex, NULL); +- lo.root.next = lo.root.prev = &lo.root; +- lo.root.fd = -1; +- lo.cache = CACHE_NORMAL; +- +- if (fuse_parse_cmdline(&args, &opts) != 0) +- return 1; +- if (opts.show_help) { +- printf("usage: %s [options] \n\n", argv[0]); +- fuse_cmdline_help(); +- fuse_lowlevel_help(); +- ret = 0; +- goto err_out1; +- } else if (opts.show_version) { +- fuse_lowlevel_version(); +- ret = 0; +- goto err_out1; +- } +- +- if(opts.mountpoint == NULL) { +- printf("usage: %s [options] \n", argv[0]); +- printf(" %s --help\n", argv[0]); +- ret = 1; +- goto err_out1; +- } +- +- if (fuse_opt_parse(&args, &lo, lo_opts, NULL)== -1) +- return 1; +- +- lo.debug = opts.debug; +- lo.root.refcount = 2; +- if (lo.source) { +- struct stat stat; +- int res; +- +- res = lstat(lo.source, &stat); +- if (res == -1) { +- fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", +- lo.source); +- exit(1); +- } +- if (!S_ISDIR(stat.st_mode)) { +- fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); +- exit(1); +- } +- +- } else { +- lo.source = "/"; +- } +- lo.root.is_symlink = false; +- if (!lo.timeout_set) { +- switch (lo.cache) { +- case CACHE_NEVER: +- lo.timeout = 0.0; +- break; +- +- case CACHE_NORMAL: +- lo.timeout = 1.0; +- break; +- +- case CACHE_ALWAYS: +- lo.timeout = 86400.0; +- break; +- } +- } else if (lo.timeout < 0) { +- fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", +- lo.timeout); +- exit(1); +- } +- +- lo.root.fd = open(lo.source, O_PATH); +- if (lo.root.fd == -1) { +- fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): %m\n", +- lo.source); +- exit(1); +- } +- +- se = 
fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); +- if (se == NULL) +- goto err_out1; +- +- if (fuse_set_signal_handlers(se) != 0) +- goto err_out2; +- +- if (fuse_session_mount(se, opts.mountpoint) != 0) +- goto err_out3; +- +- fuse_daemonize(opts.foreground); +- +- /* Block until ctrl+c or fusermount -u */ +- if (opts.singlethread) +- ret = fuse_session_loop(se); +- else +- ret = fuse_session_loop_mt(se, opts.clone_fd); +- +- fuse_session_unmount(se); ++ struct fuse_args args = FUSE_ARGS_INIT(argc, argv); ++ struct fuse_session *se; ++ struct fuse_cmdline_opts opts; ++ struct lo_data lo = { .debug = 0, .writeback = 0 }; ++ int ret = -1; ++ ++ /* Don't mask creation mode, kernel already did that */ ++ umask(0); ++ ++ pthread_mutex_init(&lo.mutex, NULL); ++ lo.root.next = lo.root.prev = &lo.root; ++ lo.root.fd = -1; ++ lo.cache = CACHE_NORMAL; ++ ++ if (fuse_parse_cmdline(&args, &opts) != 0) { ++ return 1; ++ } ++ if (opts.show_help) { ++ printf("usage: %s [options] \n\n", argv[0]); ++ fuse_cmdline_help(); ++ fuse_lowlevel_help(); ++ ret = 0; ++ goto err_out1; ++ } else if (opts.show_version) { ++ fuse_lowlevel_version(); ++ ret = 0; ++ goto err_out1; ++ } ++ ++ if (opts.mountpoint == NULL) { ++ printf("usage: %s [options] \n", argv[0]); ++ printf(" %s --help\n", argv[0]); ++ ret = 1; ++ goto err_out1; ++ } ++ ++ if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { ++ return 1; ++ } ++ ++ lo.debug = opts.debug; ++ lo.root.refcount = 2; ++ if (lo.source) { ++ struct stat stat; ++ int res; ++ ++ res = lstat(lo.source, &stat); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", ++ lo.source); ++ exit(1); ++ } ++ if (!S_ISDIR(stat.st_mode)) { ++ fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); ++ exit(1); ++ } ++ ++ } else { ++ lo.source = "/"; ++ } ++ lo.root.is_symlink = false; ++ if (!lo.timeout_set) { ++ switch (lo.cache) { ++ case CACHE_NEVER: ++ lo.timeout = 0.0; ++ break; ++ ++ case CACHE_NORMAL: ++ lo.timeout = 
1.0; ++ break; ++ ++ case CACHE_ALWAYS: ++ lo.timeout = 86400.0; ++ break; ++ } ++ } else if (lo.timeout < 0) { ++ fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout); ++ exit(1); ++ } ++ ++ lo.root.fd = open(lo.source, O_PATH); ++ if (lo.root.fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): %m\n", lo.source); ++ exit(1); ++ } ++ ++ se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); ++ if (se == NULL) { ++ goto err_out1; ++ } ++ ++ if (fuse_set_signal_handlers(se) != 0) { ++ goto err_out2; ++ } ++ ++ if (fuse_session_mount(se, opts.mountpoint) != 0) { ++ goto err_out3; ++ } ++ ++ fuse_daemonize(opts.foreground); ++ ++ /* Block until ctrl+c or fusermount -u */ ++ if (opts.singlethread) { ++ ret = fuse_session_loop(se); ++ } else { ++ ret = fuse_session_loop_mt(se, opts.clone_fd); ++ } ++ ++ fuse_session_unmount(se); + err_out3: +- fuse_remove_signal_handlers(se); ++ fuse_remove_signal_handlers(se); + err_out2: +- fuse_session_destroy(se); ++ fuse_session_destroy(se); + err_out1: +- free(opts.mountpoint); +- fuse_opt_free_args(&args); ++ free(opts.mountpoint); ++ fuse_opt_free_args(&args); + +- if (lo.root.fd >= 0) +- close(lo.root.fd); ++ if (lo.root.fd >= 0) { ++ close(lo.root.fd); ++ } + +- return ret ? 1 : 0; ++ return ret ? 1 : 0; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Handle-hard-reboot.patch b/kvm-virtiofsd-Handle-hard-reboot.patch new file mode 100755 index 0000000..8888030 --- /dev/null +++ b/kvm-virtiofsd-Handle-hard-reboot.patch @@ -0,0 +1,65 @@ +From 616407b06517361ce444dcc0960aeaf55b52da33 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:41 +0100 +Subject: [PATCH 070/116] virtiofsd: Handle hard reboot +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-67-dgilbert@redhat.com> +Patchwork-id: 93521 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 066/112] virtiofsd: Handle hard reboot +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Handle a + mount + hard reboot (without unmount) + mount + +we get another 'init' which FUSE doesn't normally expect. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e8556f49098b5d95634e592d79a97f761b76c96e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 16 +++++++++++++++- + 1 file changed, 15 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 7d742b5..65f91da 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2433,7 +2433,21 @@ void fuse_session_process_buf_int(struct fuse_session *se, + goto reply_err; + } + } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) { +- goto reply_err; ++ if (fuse_lowlevel_is_virtio(se)) { ++ /* ++ * TODO: This is after a hard reboot typically, we need to do ++ * a destroy, but we can't reply to this request yet so ++ * we can't use do_destroy ++ */ ++ fuse_log(FUSE_LOG_DEBUG, "%s: reinit\n", __func__); ++ se->got_destroy = 1; ++ se->got_init = 0; ++ if (se->op.destroy) { ++ se->op.destroy(se->userdata); ++ } ++ } else { ++ goto reply_err; ++ } + } + + err = EACCES; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Handle-reinit.patch b/kvm-virtiofsd-Handle-reinit.patch new file mode 100755 index 0000000..3f9577b --- /dev/null +++ b/kvm-virtiofsd-Handle-reinit.patch @@ -0,0 +1,53 @@ +From 485adfa1aa1b3e2d1449edf5c42d6ec396cbfb5d Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:40 +0100 +Subject: [PATCH 069/116] virtiofsd: Handle reinit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-66-dgilbert@redhat.com> +Patchwork-id: 93520 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 065/112] virtiofsd: Handle reinit +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Allow init->destroy->init for mount->umount->mount + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit c806d6435fe95fd54b379920aca2f4e3ea1f3258) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index a7a1968..7d742b5 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2028,6 +2028,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + } + + se->got_init = 1; ++ se->got_destroy = 0; + if (se->op.init) { + se->op.init(se->userdata, &se->conn); + } +@@ -2130,6 +2131,7 @@ static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, + (void)iter; + + se->got_destroy = 1; ++ se->got_init = 0; + if (se->op.destroy) { + se->op.destroy(se->userdata); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Keep-track-of-replies.patch b/kvm-virtiofsd-Keep-track-of-replies.patch new file mode 100755 index 0000000..18be3e0 --- /dev/null +++ b/kvm-virtiofsd-Keep-track-of-replies.patch @@ -0,0 +1,116 @@ +From c818a1cb603cad07aa5c49ce808aa09435667c7c Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:04 +0100 +Subject: [PATCH 033/116] virtiofsd: Keep track of replies +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-30-dgilbert@redhat.com> +Patchwork-id: 93481 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 029/112] virtiofsd: Keep track of replies +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Keep track of whether we sent a reply to a request; this is a bit +paranoid but it means: + a) We should always recycle an element even if there was an error + in the request + b) Never try and send two replies on one queue element + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 2f65e69a7f22da8d20c747f34f339ebb40a0634f) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 05d0e29..f1adeb6 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -44,6 +44,7 @@ struct fv_QueueInfo { + + /* The element for the command currently being processed */ + VuVirtqElement *qe; ++ bool reply_sent; + }; + + /* +@@ -178,6 +179,7 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + { + VuVirtqElement *elem; + VuVirtq *q; ++ int ret = 0; + + assert(count >= 1); + assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); +@@ -191,6 +193,7 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + assert(out->unique); + /* For virtio we always have ch */ + assert(ch); ++ assert(!ch->qi->reply_sent); + elem = ch->qi->qe; + q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; + +@@ -208,19 +211,23 @@ int 
virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + if (in_len < sizeof(struct fuse_out_header)) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", + __func__, elem->index); +- return -E2BIG; ++ ret = -E2BIG; ++ goto err; + } + if (in_len < tosend_len) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", + __func__, elem->index, tosend_len); +- return -E2BIG; ++ ret = -E2BIG; ++ goto err; + } + + copy_iov(iov, count, in_sg, in_num, tosend_len); + vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); + vu_queue_notify(&se->virtio_dev->dev, q); ++ ch->qi->reply_sent = true; + +- return 0; ++err: ++ return ret; + } + + /* Thread function for individual queues, created when a queue is 'started' */ +@@ -296,6 +303,9 @@ static void *fv_queue_thread(void *opaque) + break; + } + ++ qi->qe = elem; ++ qi->reply_sent = false; ++ + if (!fbuf.mem) { + fbuf.mem = malloc(se->bufsize); + assert(fbuf.mem); +@@ -331,6 +341,13 @@ static void *fv_queue_thread(void *opaque) + /* TODO: Add checks for fuse_session_exited */ + fuse_session_process_buf_int(se, &fbuf, &ch); + ++ if (!qi->reply_sent) { ++ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", ++ __func__, elem->index); ++ /* I think we've still got to recycle the element */ ++ vu_queue_push(dev, q, elem, 0); ++ vu_queue_notify(dev, q); ++ } + qi->qe = NULL; + free(elem); + elem = NULL; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Kill-threads-when-queues-are-stopped.patch b/kvm-virtiofsd-Kill-threads-when-queues-are-stopped.patch new file mode 100755 index 0000000..5e054f3 --- /dev/null +++ b/kvm-virtiofsd-Kill-threads-when-queues-are-stopped.patch @@ -0,0 +1,143 @@ +From b37344c38b866c7e7fb773b4a3172a39306bac7e Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:42 +0100 +Subject: [PATCH 071/116] virtiofsd: Kill threads when queues are stopped +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-68-dgilbert@redhat.com> +Patchwork-id: 93522 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 067/112] virtiofsd: Kill threads when queues are stopped +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Kill the threads we've started when the queues get stopped. + +Signed-off-by: Dr. David Alan Gilbert +With improvements by: +Signed-off-by: Eryu Guan +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 10477ac47fc57d00a84802ff97c15450cd8021c1) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 51 +++++++++++++++++++++++++++++++++++++------ + 1 file changed, 44 insertions(+), 7 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 872968f..7a8774a 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -41,6 +41,7 @@ struct fv_QueueInfo { + /* Our queue index, corresponds to array position */ + int qidx; + int kick_fd; ++ int kill_fd; /* For killing the thread */ + + /* The element for the command currently being processed */ + VuVirtqElement *qe; +@@ -412,14 +413,17 @@ static void *fv_queue_thread(void *opaque) + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); + while (1) { +- struct pollfd pf[1]; ++ struct pollfd pf[2]; + pf[0].fd = qi->kick_fd; + pf[0].events = POLLIN; + pf[0].revents = 0; ++ pf[1].fd = qi->kill_fd; ++ pf[1].events = POLLIN; ++ pf[1].revents = 0; + + fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__, + qi->qidx); +- int poll_res = ppoll(pf, 1, 
NULL, NULL); ++ int poll_res = ppoll(pf, 2, NULL, NULL); + + if (poll_res == -1) { + if (errno == EINTR) { +@@ -430,12 +434,23 @@ static void *fv_queue_thread(void *opaque) + fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n"); + break; + } +- assert(poll_res == 1); ++ assert(poll_res >= 1); + if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { + fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n", + __func__, pf[0].revents, qi->qidx); + break; + } ++ if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) { ++ fuse_log(FUSE_LOG_ERR, ++ "%s: Unexpected poll revents %x Queue %d killfd\n", ++ __func__, pf[1].revents, qi->qidx); ++ break; ++ } ++ if (pf[1].revents) { ++ fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n", ++ __func__, qi->qidx); ++ break; ++ } + assert(pf[0].revents & POLLIN); + fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__, + qi->qidx); +@@ -589,6 +604,28 @@ out: + return NULL; + } + ++static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx) ++{ ++ int ret; ++ struct fv_QueueInfo *ourqi; ++ ++ assert(qidx < vud->nqueues); ++ ourqi = vud->qi[qidx]; ++ ++ /* Kill the thread */ ++ if (eventfd_write(ourqi->kill_fd, 1)) { ++ fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n", ++ qidx, strerror(errno)); ++ } ++ ret = pthread_join(ourqi->thread, NULL); ++ if (ret) { ++ fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n", ++ __func__, qidx, ret); ++ } ++ close(ourqi->kill_fd); ++ ourqi->kick_fd = -1; ++} ++ + /* Callback from libvhost-user on start or stop of a queue */ + static void fv_queue_set_started(VuDev *dev, int qidx, bool started) + { +@@ -633,16 +670,16 @@ static void fv_queue_set_started(VuDev *dev, int qidx, bool started) + } + ourqi = vud->qi[qidx]; + ourqi->kick_fd = dev->vq[qidx].kick_fd; ++ ++ ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE); ++ assert(ourqi->kill_fd != -1); + if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) { + 
fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n", + __func__, qidx); + assert(0); + } + } else { +- /* TODO: Kill the thread */ +- assert(qidx < vud->nqueues); +- ourqi = vud->qi[qidx]; +- ourqi->kick_fd = -1; ++ fv_queue_cleanup_thread(vud, qidx); + } + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Make-fsync-work-even-if-only-inode-is-pass.patch b/kvm-virtiofsd-Make-fsync-work-even-if-only-inode-is-pass.patch new file mode 100755 index 0000000..98211cb --- /dev/null +++ b/kvm-virtiofsd-Make-fsync-work-even-if-only-inode-is-pass.patch @@ -0,0 +1,96 @@ +From f09f13f9a001a50ee3465c165f4bbaf870fcadb9 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:53 +0100 +Subject: [PATCH 022/116] virtiofsd: Make fsync work even if only inode is + passed in +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-19-dgilbert@redhat.com> +Patchwork-id: 93472 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 018/112] virtiofsd: Make fsync work even if only inode is passed in +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +If caller has not sent file handle in request, then using inode, retrieve +the fd opened using O_PATH and use that to open file again and issue +fsync. This will be needed when dax_flush() calls fsync. At that time +we only have inode information (and not file). + +Signed-off-by: Vivek Goyal +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 1b209805f8159c3f4d89ddb9390a5f64887cebff) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 6 +++++- + tools/virtiofsd/passthrough_ll.c | 28 ++++++++++++++++++++++++++-- + 2 files changed, 31 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 514d79c..8552cfb 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1075,7 +1075,11 @@ static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + fi.fh = arg->fh; + + if (req->se->op.fsync) { +- req->se->op.fsync(req, nodeid, datasync, &fi); ++ if (fi.fh == (uint64_t)-1) { ++ req->se->op.fsync(req, nodeid, datasync, NULL); ++ } else { ++ req->se->op.fsync(req, nodeid, datasync, &fi); ++ } + } else { + fuse_reply_err(req, ENOSYS); + } +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 6c4da18..26ac870 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -903,10 +903,34 @@ static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + { + int res; + (void)ino; ++ int fd; ++ char *buf; ++ ++ fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino, ++ (void *)fi); ++ ++ if (!fi) { ++ res = asprintf(&buf, "/proc/self/fd/%i", lo_fd(req, ino)); ++ if (res == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } ++ ++ fd = open(buf, O_RDWR); ++ free(buf); ++ if (fd == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } ++ } else { ++ fd = fi->fh; ++ } ++ + if (datasync) { +- res = fdatasync(fi->fh); ++ res = fdatasync(fd); + } else { +- res = fsync(fi->fh); ++ res = fsync(fd); ++ } ++ if (!fi) { ++ close(fd); + } + fuse_reply_err(req, res == -1 ? 
errno : 0); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Open-vhost-connection-instead-of-mounting.patch b/kvm-virtiofsd-Open-vhost-connection-instead-of-mounting.patch new file mode 100755 index 0000000..2c9874d --- /dev/null +++ b/kvm-virtiofsd-Open-vhost-connection-instead-of-mounting.patch @@ -0,0 +1,257 @@ +From a96042f05eaf494fbe26a9cbd940f5f815f782f9 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:56 +0100 +Subject: [PATCH 025/116] virtiofsd: Open vhost connection instead of mounting +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-22-dgilbert@redhat.com> +Patchwork-id: 93476 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 021/112] virtiofsd: Open vhost connection instead of mounting +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +When run with vhost-user options we connect to the QEMU instead +via a socket. Start this off by creating the socket. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit d14bf584dd965821e80d14c16d9292a464b1ab85) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 7 ++-- + tools/virtiofsd/fuse_lowlevel.c | 55 ++++------------------------ + tools/virtiofsd/fuse_virtio.c | 79 +++++++++++++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_virtio.h | 23 ++++++++++++ + 4 files changed, 114 insertions(+), 50 deletions(-) + create mode 100644 tools/virtiofsd/fuse_virtio.c + create mode 100644 tools/virtiofsd/fuse_virtio.h + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 26b1a7d..82d6ac7 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -6,9 +6,10 @@ + * See the file COPYING.LIB + */ + +-#define FUSE_USE_VERSION 31 +- ++#ifndef FUSE_I_H ++#define FUSE_I_H + ++#define FUSE_USE_VERSION 31 + #include "fuse.h" + #include "fuse_lowlevel.h" + +@@ -101,3 +102,5 @@ void fuse_session_process_buf_int(struct fuse_session *se, + + /* room needed in buffer to accommodate header */ + #define FUSE_BUFFER_HEADER_SIZE 0x1000 ++ ++#endif +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 17e8718..5df124e 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -14,6 +14,7 @@ + #include "standard-headers/linux/fuse.h" + #include "fuse_misc.h" + #include "fuse_opt.h" ++#include "fuse_virtio.h" + + #include + #include +@@ -2202,6 +2203,11 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + goto out4; + } + ++ if (!se->vu_socket_path) { ++ fprintf(stderr, "fuse: missing -o vhost_user_socket option\n"); ++ goto out4; ++ } ++ + se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + FUSE_BUFFER_HEADER_SIZE; + + list_init_req(&se->list); +@@ -2224,54 +2230,7 @@ out1: + + int fuse_session_mount(struct fuse_session *se) + { +- int fd; +- +- /* +- * Make sure file descriptors 0, 1 and 2 are open, otherwise chaos +- * would ensue. 
+- */ +- do { +- fd = open("/dev/null", O_RDWR); +- if (fd > 2) { +- close(fd); +- } +- } while (fd >= 0 && fd <= 2); +- +- /* +- * To allow FUSE daemons to run without privileges, the caller may open +- * /dev/fuse before launching the file system and pass on the file +- * descriptor by specifying /dev/fd/N as the mount point. Note that the +- * parent process takes care of performing the mount in this case. +- */ +- fd = fuse_mnt_parse_fuse_fd(mountpoint); +- if (fd != -1) { +- if (fcntl(fd, F_GETFD) == -1) { +- fuse_log(FUSE_LOG_ERR, "fuse: Invalid file descriptor /dev/fd/%u\n", +- fd); +- return -1; +- } +- se->fd = fd; +- return 0; +- } +- +- /* Open channel */ +- fd = fuse_kern_mount(mountpoint, se->mo); +- if (fd == -1) { +- return -1; +- } +- se->fd = fd; +- +- /* Save mountpoint */ +- se->mountpoint = strdup(mountpoint); +- if (se->mountpoint == NULL) { +- goto error_out; +- } +- +- return 0; +- +-error_out: +- fuse_kern_unmount(mountpoint, fd); +- return -1; ++ return virtio_session_mount(se); + } + + int fuse_session_fd(struct fuse_session *se) +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +new file mode 100644 +index 0000000..cbef6ff +--- /dev/null ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -0,0 +1,79 @@ ++/* ++ * virtio-fs glue for FUSE ++ * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates ++ * ++ * Authors: ++ * Dave Gilbert ++ * ++ * Implements the glue between libfuse and libvhost-user ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. 
++ * See the file COPYING.LIB ++ */ ++ ++#include "fuse_i.h" ++#include "standard-headers/linux/fuse.h" ++#include "fuse_misc.h" ++#include "fuse_opt.h" ++#include "fuse_virtio.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* From spec */ ++struct virtio_fs_config { ++ char tag[36]; ++ uint32_t num_queues; ++}; ++ ++int virtio_session_mount(struct fuse_session *se) ++{ ++ struct sockaddr_un un; ++ mode_t old_umask; ++ ++ if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) { ++ fuse_log(FUSE_LOG_ERR, "Socket path too long\n"); ++ return -1; ++ } ++ ++ se->fd = -1; ++ ++ /* ++ * Create the Unix socket to communicate with qemu ++ * based on QEMU's vhost-user-bridge ++ */ ++ unlink(se->vu_socket_path); ++ strcpy(un.sun_path, se->vu_socket_path); ++ size_t addr_len = sizeof(un); ++ ++ int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0); ++ if (listen_sock == -1) { ++ fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n"); ++ return -1; ++ } ++ un.sun_family = AF_UNIX; ++ ++ /* ++ * Unfortunately bind doesn't let you set the mask on the socket, ++ * so set umask to 077 and restore it later. ++ */ ++ old_umask = umask(0077); ++ if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) { ++ fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n"); ++ umask(old_umask); ++ return -1; ++ } ++ umask(old_umask); ++ ++ if (listen(listen_sock, 1) == -1) { ++ fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n"); ++ return -1; ++ } ++ ++ return -1; ++} +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +new file mode 100644 +index 0000000..8f2edb6 +--- /dev/null ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -0,0 +1,23 @@ ++/* ++ * virtio-fs glue for FUSE ++ * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates ++ * ++ * Authors: ++ * Dave Gilbert ++ * ++ * Implements the glue between libfuse and libvhost-user ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. 
++ * See the file COPYING.LIB ++ */ ++ ++#ifndef FUSE_VIRTIO_H ++#define FUSE_VIRTIO_H ++ ++#include "fuse_i.h" ++ ++struct fuse_session; ++ ++int virtio_session_mount(struct fuse_session *se); ++ ++#endif +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch b/kvm-virtiofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch new file mode 100755 index 0000000..8d8de78 --- /dev/null +++ b/kvm-virtiofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch @@ -0,0 +1,76 @@ +From ade3dcad8a907d281549b341a8908851e36ba458 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:31 +0100 +Subject: [PATCH 060/116] virtiofsd: Parse flag FUSE_WRITE_KILL_PRIV +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-57-dgilbert@redhat.com> +Patchwork-id: 93505 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 056/112] virtiofsd: Parse flag FUSE_WRITE_KILL_PRIV +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +Caller can set FUSE_WRITE_KILL_PRIV in write_flags. Parse it and pass it +to the filesystem. + +Signed-off-by: Vivek Goyal +Reviewed-by: Misono Tomohiro +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit f779bc5265e7e7abb13a03d4bfbc74151afc15c2) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_common.h | 6 +++++- + tools/virtiofsd/fuse_lowlevel.c | 4 +++- + 2 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index f8f6433..686c42c 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -93,8 +93,12 @@ struct fuse_file_info { + */ + unsigned int cache_readdir:1; + ++ /* Indicates that suid/sgid bits should be removed upon write */ ++ unsigned int kill_priv:1; ++ ++ + /** Padding. 
Reserved for future use*/ +- unsigned int padding:25; ++ unsigned int padding:24; + unsigned int padding2:32; + + /* +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 02e1d83..2d6dc5a 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1142,6 +1142,7 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; ++ fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV); + + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; +@@ -1177,7 +1178,8 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; + fi.fh = arg->fh; +- fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; ++ fi.writepage = !!(arg->write_flags & FUSE_WRITE_CACHE); ++ fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV); + + if (ibufv->count == 1) { + assert(!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Pass-write-iov-s-all-the-way-through.patch b/kvm-virtiofsd-Pass-write-iov-s-all-the-way-through.patch new file mode 100755 index 0000000..7d095c9 --- /dev/null +++ b/kvm-virtiofsd-Pass-write-iov-s-all-the-way-through.patch @@ -0,0 +1,140 @@ +From d5986c804f05070a07dfe702f7c66357daaa1ab6 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:20 +0100 +Subject: [PATCH 049/116] virtiofsd: Pass write iov's all the way through +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-46-dgilbert@redhat.com> +Patchwork-id: 93497 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 045/112] virtiofsd: Pass write iov's all the way through +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. 
David Alan Gilbert" + +Pass the write iov pointing to guest RAM all the way through rather +than copying the data. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Xiao Yang +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e17f7a580e2c599330ad3a6946be615ca2fe97d9) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 79 +++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 73 insertions(+), 6 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index fd588a4..872968f 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -454,6 +454,10 @@ static void *fv_queue_thread(void *opaque) + __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); + + while (1) { ++ bool allocated_bufv = false; ++ struct fuse_bufvec bufv; ++ struct fuse_bufvec *pbufv; ++ + /* + * An element contains one request and the space to send our + * response They're spread over multiple descriptors in a +@@ -495,14 +499,76 @@ static void *fv_queue_thread(void *opaque) + __func__, elem->index); + assert(0); /* TODO */ + } +- copy_from_iov(&fbuf, out_num, out_sg); +- fbuf.size = out_len; ++ /* Copy just the first element and look at it */ ++ copy_from_iov(&fbuf, 1, out_sg); ++ ++ if (out_num > 2 && ++ out_sg[0].iov_len == sizeof(struct fuse_in_header) && ++ ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE && ++ out_sg[1].iov_len == sizeof(struct fuse_write_in)) { ++ /* ++ * For a write we don't actually need to copy the ++ * data, we can just do it straight out of guest memory ++ * but we must still copy the headers in case the guest ++ * was nasty and changed them while we were using them. 
++ */ ++ fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__); ++ ++ /* copy the fuse_write_in header after the fuse_in_header */ ++ fbuf.mem += out_sg->iov_len; ++ copy_from_iov(&fbuf, 1, out_sg + 1); ++ fbuf.mem -= out_sg->iov_len; ++ fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len; ++ ++ /* Allocate the bufv, with space for the rest of the iov */ ++ allocated_bufv = true; ++ pbufv = malloc(sizeof(struct fuse_bufvec) + ++ sizeof(struct fuse_buf) * (out_num - 2)); ++ if (!pbufv) { ++ vu_queue_unpop(dev, q, elem, 0); ++ free(elem); ++ fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n", ++ __func__); ++ goto out; ++ } ++ ++ pbufv->count = 1; ++ pbufv->buf[0] = fbuf; ++ ++ size_t iovindex, pbufvindex; ++ iovindex = 2; /* 2 headers, separate iovs */ ++ pbufvindex = 1; /* 2 headers, 1 fusebuf */ ++ ++ for (; iovindex < out_num; iovindex++, pbufvindex++) { ++ pbufv->count++; ++ pbufv->buf[pbufvindex].pos = ~0; /* Dummy */ ++ pbufv->buf[pbufvindex].flags = 0; ++ pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base; ++ pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len; ++ } ++ } else { ++ /* Normal (non fast write) path */ ++ ++ /* Copy the rest of the buffer */ ++ fbuf.mem += out_sg->iov_len; ++ copy_from_iov(&fbuf, out_num - 1, out_sg + 1); ++ fbuf.mem -= out_sg->iov_len; ++ fbuf.size = out_len; + +- /* TODO! Endianness of header */ ++ /* TODO! 
Endianness of header */ + +- /* TODO: Add checks for fuse_session_exited */ +- struct fuse_bufvec bufv = { .buf[0] = fbuf, .count = 1 }; +- fuse_session_process_buf_int(se, &bufv, &ch); ++ /* TODO: Add checks for fuse_session_exited */ ++ bufv.buf[0] = fbuf; ++ bufv.count = 1; ++ pbufv = &bufv; ++ } ++ pbufv->idx = 0; ++ pbufv->off = 0; ++ fuse_session_process_buf_int(se, pbufv, &ch); ++ ++ if (allocated_bufv) { ++ free(pbufv); ++ } + + if (!qi->reply_sent) { + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", +@@ -516,6 +582,7 @@ static void *fv_queue_thread(void *opaque) + elem = NULL; + } + } ++out: + pthread_mutex_destroy(&ch.lock); + free(fbuf.mem); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Plumb-fuse_bufvec-through-to-do_write_buf.patch b/kvm-virtiofsd-Plumb-fuse_bufvec-through-to-do_write_buf.patch new file mode 100755 index 0000000..834ced1 --- /dev/null +++ b/kvm-virtiofsd-Plumb-fuse_bufvec-through-to-do_write_buf.patch @@ -0,0 +1,168 @@ +From 9e4320eec5204da851ac95fb7a7e6520c9ccee7d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:19 +0100 +Subject: [PATCH 048/116] virtiofsd: Plumb fuse_bufvec through to do_write_buf +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-45-dgilbert@redhat.com> +Patchwork-id: 93499 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 044/112] virtiofsd: Plumb fuse_bufvec through to do_write_buf +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Let fuse_session_process_buf_int take a fuse_bufvec * instead of a +fuse_buf; and then through to do_write_buf - where in the best +case it can pass that straight through to op.write_buf without copying +(other than skipping a header). + +Signed-off-by: Dr. 
David Alan Gilbert +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 469f9d2fc405b0508e6cf1b4b5bbcadfc82064e5) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 2 +- + tools/virtiofsd/fuse_lowlevel.c | 61 +++++++++++++++++++++++++++-------------- + tools/virtiofsd/fuse_virtio.c | 3 +- + 3 files changed, 44 insertions(+), 22 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 45995f3..a20854f 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -100,7 +100,7 @@ int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + void fuse_free_req(fuse_req_t req); + + void fuse_session_process_buf_int(struct fuse_session *se, +- const struct fuse_buf *buf, ++ struct fuse_bufvec *bufv, + struct fuse_chan *ch); + + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 95f4db8..7e10995 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1004,11 +1004,12 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + + static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, +- const struct fuse_buf *ibuf) ++ struct fuse_bufvec *ibufv) + { + struct fuse_session *se = req->se; +- struct fuse_bufvec bufv = { +- .buf[0] = *ibuf, ++ struct fuse_bufvec *pbufv = ibufv; ++ struct fuse_bufvec tmpbufv = { ++ .buf[0] = ibufv->buf[0], + .count = 1, + }; + struct fuse_write_in *arg = (struct fuse_write_in *)inarg; +@@ -1018,22 +1019,31 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, + fi.fh = arg->fh; + fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; + +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { +- bufv.buf[0].mem = PARAM(arg); +- } +- +- bufv.buf[0].size -= +- sizeof(struct fuse_in_header) + sizeof(struct 
fuse_write_in); +- if (bufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); +- fuse_reply_err(req, EIO); +- return; ++ if (ibufv->count == 1) { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ if (!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)) { ++ tmpbufv.buf[0].mem = PARAM(arg); ++ } ++ tmpbufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); ++ if (tmpbufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: do_write_buf: buffer size too small\n"); ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ tmpbufv.buf[0].size = arg->size; ++ pbufv = &tmpbufv; ++ } else { ++ /* ++ * Input bufv contains the headers in the first element ++ * and the data in the rest, we need to skip that first element ++ */ ++ ibufv->buf[0].size = 0; + } +- bufv.buf[0].size = arg->size; + +- se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); ++ se->op.write_buf(req, nodeid, pbufv, arg->offset, &fi); + } + + static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) +@@ -2024,13 +2034,24 @@ static const char *opname(enum fuse_opcode opcode) + void fuse_session_process_buf(struct fuse_session *se, + const struct fuse_buf *buf) + { +- fuse_session_process_buf_int(se, buf, NULL); ++ struct fuse_bufvec bufv = { .buf[0] = *buf, .count = 1 }; ++ fuse_session_process_buf_int(se, &bufv, NULL); + } + ++/* ++ * Restriction: ++ * bufv is normally a single entry buffer, except for a write ++ * where (if it's in memory) then the bufv may be multiple entries, ++ * where the first entry contains all headers and subsequent entries ++ * contain data ++ * bufv shall not use any offsets etc to make the data anything ++ * other than contiguous starting from 0. 
++ */ + void fuse_session_process_buf_int(struct fuse_session *se, +- const struct fuse_buf *buf, ++ struct fuse_bufvec *bufv, + struct fuse_chan *ch) + { ++ const struct fuse_buf *buf = bufv->buf; + struct fuse_in_header *in; + const void *inarg; + struct fuse_req *req; +@@ -2108,7 +2129,7 @@ void fuse_session_process_buf_int(struct fuse_session *se, + + inarg = (void *)&in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) { +- do_write_buf(req, in->nodeid, inarg, buf); ++ do_write_buf(req, in->nodeid, inarg, bufv); + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); + } +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 635f877..fd588a4 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -501,7 +501,8 @@ static void *fv_queue_thread(void *opaque) + /* TODO! Endianness of header */ + + /* TODO: Add checks for fuse_session_exited */ +- fuse_session_process_buf_int(se, &fbuf, &ch); ++ struct fuse_bufvec bufv = { .buf[0] = fbuf, .count = 1 }; ++ fuse_session_process_buf_int(se, &bufv, &ch); + + if (!qi->reply_sent) { + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Poll-kick_fd-for-queue.patch b/kvm-virtiofsd-Poll-kick_fd-for-queue.patch new file mode 100755 index 0000000..d7c6c0a --- /dev/null +++ b/kvm-virtiofsd-Poll-kick_fd-for-queue.patch @@ -0,0 +1,97 @@ +From 083b944fac29bc3115a19eb38e176f6b23f04938 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:01 +0100 +Subject: [PATCH 030/116] virtiofsd: Poll kick_fd for queue +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-27-dgilbert@redhat.com> +Patchwork-id: 93483 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 026/112] virtiofsd: Poll kick_fd for queue +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +In the queue thread poll the kick_fd we're passed. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 5dcd1f56141378226d33dc3df68ec57913e0aa04) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 40 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 39 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 2a94bb3..05e7258 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -100,13 +101,50 @@ static void fv_panic(VuDev *dev, const char *err) + exit(EXIT_FAILURE); + } + ++/* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { + struct fv_QueueInfo *qi = opaque; + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); + while (1) { +- /* TODO */ ++ struct pollfd pf[1]; ++ pf[0].fd = qi->kick_fd; ++ pf[0].events = POLLIN; ++ pf[0].revents = 0; ++ ++ fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__, ++ qi->qidx); ++ int poll_res = ppoll(pf, 1, NULL, NULL); ++ ++ if (poll_res == -1) { ++ if (errno == EINTR) { ++ fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n", ++ __func__); ++ continue; ++ } ++ fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n"); ++ break; ++ } ++ assert(poll_res == 1); ++ if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { ++ fuse_log(FUSE_LOG_ERR, "%s: 
Unexpected poll revents %x Queue %d\n", ++ __func__, pf[0].revents, qi->qidx); ++ break; ++ } ++ assert(pf[0].revents & POLLIN); ++ fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__, ++ qi->qidx); ++ ++ eventfd_t evalue; ++ if (eventfd_read(qi->kick_fd, &evalue)) { ++ fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n"); ++ break; ++ } ++ if (qi->virtio_dev->se->debug) { ++ fprintf(stderr, "%s: Queue %d gave evalue: %zx\n", __func__, ++ qi->qidx, (size_t)evalue); ++ } + } + + return NULL; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Prevent-multiply-running-with-same-vhost_u.patch b/kvm-virtiofsd-Prevent-multiply-running-with-same-vhost_u.patch new file mode 100755 index 0000000..d4e1ea1 --- /dev/null +++ b/kvm-virtiofsd-Prevent-multiply-running-with-same-vhost_u.patch @@ -0,0 +1,144 @@ +From ab336e3aea97d76c1b2ac725d19b4518f47dd8f0 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:59 +0100 +Subject: [PATCH 088/116] virtiofsd: Prevent multiply running with same + vhost_user_socket +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-85-dgilbert@redhat.com> +Patchwork-id: 93541 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 084/112] virtiofsd: Prevent multiply running with same vhost_user_socket +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Masayoshi Mizuma + +virtiofsd can run multiply even if the vhost_user_socket is same path. + + ]# ./virtiofsd -o vhost_user_socket=/tmp/vhostqemu -o source=/tmp/share & + [1] 244965 + virtio_session_mount: Waiting for vhost-user socket connection... + ]# ./virtiofsd -o vhost_user_socket=/tmp/vhostqemu -o source=/tmp/share & + [2] 244966 + virtio_session_mount: Waiting for vhost-user socket connection... 
+ ]# + +The user will get confused about the situation and maybe the cause of the +unexpected problem. So it's better to prevent the multiple running. + +Create a regular file under localstatedir directory to exclude the +vhost_user_socket. To create and lock the file, use qemu_write_pidfile() +because the API has some sanity checks and file lock. + +Signed-off-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert + Applied fixes from Stefan's review and moved osdep include +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 96814800d2b49d18737c36e021c387697ec40c62) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 1 + + tools/virtiofsd/fuse_virtio.c | 49 ++++++++++++++++++++++++++++++++++++++++- + 2 files changed, 49 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 440508a..aac282f 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -18,6 +18,7 @@ + + #include + #include ++#include + #include + #include + #include +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index e7bd772..b7948de 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -13,11 +13,12 @@ + + #include "qemu/osdep.h" + #include "qemu/iov.h" +-#include "fuse_virtio.h" ++#include "qapi/error.h" + #include "fuse_i.h" + #include "standard-headers/linux/fuse.h" + #include "fuse_misc.h" + #include "fuse_opt.h" ++#include "fuse_virtio.h" + + #include + #include +@@ -743,6 +744,42 @@ int virtio_loop(struct fuse_session *se) + return 0; + } + ++static void strreplace(char *s, char old, char new) ++{ ++ for (; *s; ++s) { ++ if (*s == old) { ++ *s = new; ++ } ++ } ++} ++ ++static bool fv_socket_lock(struct fuse_session *se) ++{ ++ g_autofree gchar *sk_name = NULL; ++ g_autofree gchar *pidfile = NULL; ++ g_autofree gchar *dir = NULL; ++ 
Error *local_err = NULL; ++ ++ dir = qemu_get_local_state_pathname("run/virtiofsd"); ++ ++ if (g_mkdir_with_parents(dir, S_IRWXU) < 0) { ++ fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s", ++ __func__, dir, strerror(errno)); ++ return false; ++ } ++ ++ sk_name = g_strdup(se->vu_socket_path); ++ strreplace(sk_name, '/', '.'); ++ pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name); ++ ++ if (!qemu_write_pidfile(pidfile, &local_err)) { ++ error_report_err(local_err); ++ return false; ++ } ++ ++ return true; ++} ++ + static int fv_create_listen_socket(struct fuse_session *se) + { + struct sockaddr_un un; +@@ -758,6 +795,16 @@ static int fv_create_listen_socket(struct fuse_session *se) + return -1; + } + ++ if (!strlen(se->vu_socket_path)) { ++ fuse_log(FUSE_LOG_ERR, "Socket path is empty\n"); ++ return -1; ++ } ++ ++ /* Check the vu_socket_path is already used */ ++ if (!fv_socket_lock(se)) { ++ return -1; ++ } ++ + /* + * Create the Unix socket to communicate with qemu + * based on QEMU's vhost-user-bridge +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch b/kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch new file mode 100755 index 0000000..f30f23a --- /dev/null +++ b/kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch @@ -0,0 +1,945 @@ +From e7c1ad608117b21f80c762f5505a66b21c56e9d3 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:40 +0100 +Subject: [PATCH 009/116] virtiofsd: Pull in kernel's fuse.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-6-dgilbert@redhat.com> +Patchwork-id: 93460 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 005/112] virtiofsd: Pull in kernel's fuse.h +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. 
David Alan Gilbert" + +Update scripts/update-linux-headers.sh to add fuse.h and +use it to pull in fuse.h from the kernel; from v5.5-rc1 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a62a9e192bc5f0aa0bc076b51db5a069add87c78) +Signed-off-by: Miroslav Rezanina +--- + include/standard-headers/linux/fuse.h | 891 ++++++++++++++++++++++++++++++++++ + scripts/update-linux-headers.sh | 1 + + 2 files changed, 892 insertions(+) + create mode 100644 include/standard-headers/linux/fuse.h + +diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h +new file mode 100644 +index 0000000..f4df0a4 +--- /dev/null ++++ b/include/standard-headers/linux/fuse.h +@@ -0,0 +1,891 @@ ++/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ ++/* ++ This file defines the kernel interface of FUSE ++ Copyright (C) 2001-2008 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU GPL. ++ See the file COPYING. ++ ++ This -- and only this -- header file may also be distributed under ++ the terms of the BSD Licence as follows: ++ ++ Copyright (C) 2001-2007 Miklos Szeredi. All rights reserved. ++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ 1. Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ 2. Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. 
++ ++ THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE ++ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ SUCH DAMAGE. ++*/ ++ ++/* ++ * This file defines the kernel interface of FUSE ++ * ++ * Protocol changelog: ++ * ++ * 7.1: ++ * - add the following messages: ++ * FUSE_SETATTR, FUSE_SYMLINK, FUSE_MKNOD, FUSE_MKDIR, FUSE_UNLINK, ++ * FUSE_RMDIR, FUSE_RENAME, FUSE_LINK, FUSE_OPEN, FUSE_READ, FUSE_WRITE, ++ * FUSE_RELEASE, FUSE_FSYNC, FUSE_FLUSH, FUSE_SETXATTR, FUSE_GETXATTR, ++ * FUSE_LISTXATTR, FUSE_REMOVEXATTR, FUSE_OPENDIR, FUSE_READDIR, ++ * FUSE_RELEASEDIR ++ * - add padding to messages to accommodate 32-bit servers on 64-bit kernels ++ * ++ * 7.2: ++ * - add FOPEN_DIRECT_IO and FOPEN_KEEP_CACHE flags ++ * - add FUSE_FSYNCDIR message ++ * ++ * 7.3: ++ * - add FUSE_ACCESS message ++ * - add FUSE_CREATE message ++ * - add filehandle to fuse_setattr_in ++ * ++ * 7.4: ++ * - add frsize to fuse_kstatfs ++ * - clean up request size limit checking ++ * ++ * 7.5: ++ * - add flags and max_write to fuse_init_out ++ * ++ * 7.6: ++ * - add max_readahead to fuse_init_in and fuse_init_out ++ * ++ * 7.7: ++ * - add FUSE_INTERRUPT message ++ * - add POSIX file lock support ++ * ++ * 7.8: ++ * - add lock_owner and flags fields to fuse_release_in ++ * - add FUSE_BMAP message ++ * - add FUSE_DESTROY message ++ * ++ * 7.9: ++ * - new 
fuse_getattr_in input argument of GETATTR ++ * - add lk_flags in fuse_lk_in ++ * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in ++ * - add blksize field to fuse_attr ++ * - add file flags field to fuse_read_in and fuse_write_in ++ * - Add ATIME_NOW and MTIME_NOW flags to fuse_setattr_in ++ * ++ * 7.10 ++ * - add nonseekable open flag ++ * ++ * 7.11 ++ * - add IOCTL message ++ * - add unsolicited notification support ++ * - add POLL message and NOTIFY_POLL notification ++ * ++ * 7.12 ++ * - add umask flag to input argument of create, mknod and mkdir ++ * - add notification messages for invalidation of inodes and ++ * directory entries ++ * ++ * 7.13 ++ * - make max number of background requests and congestion threshold ++ * tunables ++ * ++ * 7.14 ++ * - add splice support to fuse device ++ * ++ * 7.15 ++ * - add store notify ++ * - add retrieve notify ++ * ++ * 7.16 ++ * - add BATCH_FORGET request ++ * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct ++ * fuse_ioctl_iovec' instead of ambiguous 'struct iovec' ++ * - add FUSE_IOCTL_32BIT flag ++ * ++ * 7.17 ++ * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK ++ * ++ * 7.18 ++ * - add FUSE_IOCTL_DIR flag ++ * - add FUSE_NOTIFY_DELETE ++ * ++ * 7.19 ++ * - add FUSE_FALLOCATE ++ * ++ * 7.20 ++ * - add FUSE_AUTO_INVAL_DATA ++ * ++ * 7.21 ++ * - add FUSE_READDIRPLUS ++ * - send the requested events in POLL request ++ * ++ * 7.22 ++ * - add FUSE_ASYNC_DIO ++ * ++ * 7.23 ++ * - add FUSE_WRITEBACK_CACHE ++ * - add time_gran to fuse_init_out ++ * - add reserved space to fuse_init_out ++ * - add FATTR_CTIME ++ * - add ctime and ctimensec to fuse_setattr_in ++ * - add FUSE_RENAME2 request ++ * - add FUSE_NO_OPEN_SUPPORT flag ++ * ++ * 7.24 ++ * - add FUSE_LSEEK for SEEK_HOLE and SEEK_DATA support ++ * ++ * 7.25 ++ * - add FUSE_PARALLEL_DIROPS ++ * ++ * 7.26 ++ * - add FUSE_HANDLE_KILLPRIV ++ * - add FUSE_POSIX_ACL ++ * ++ * 7.27 ++ * - add FUSE_ABORT_ERROR ++ * ++ * 7.28 ++ * - 
add FUSE_COPY_FILE_RANGE ++ * - add FOPEN_CACHE_DIR ++ * - add FUSE_MAX_PAGES, add max_pages to init_out ++ * - add FUSE_CACHE_SYMLINKS ++ * ++ * 7.29 ++ * - add FUSE_NO_OPENDIR_SUPPORT flag ++ * ++ * 7.30 ++ * - add FUSE_EXPLICIT_INVAL_DATA ++ * - add FUSE_IOCTL_COMPAT_X32 ++ * ++ * 7.31 ++ * - add FUSE_WRITE_KILL_PRIV flag ++ * - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING ++ * - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag ++ */ ++ ++#ifndef _LINUX_FUSE_H ++#define _LINUX_FUSE_H ++ ++#include ++ ++/* ++ * Version negotiation: ++ * ++ * Both the kernel and userspace send the version they support in the ++ * INIT request and reply respectively. ++ * ++ * If the major versions match then both shall use the smallest ++ * of the two minor versions for communication. ++ * ++ * If the kernel supports a larger major version, then userspace shall ++ * reply with the major version it supports, ignore the rest of the ++ * INIT message and expect a new INIT message from the kernel with a ++ * matching major version. ++ * ++ * If the library supports a larger major version, then it shall fall ++ * back to the major protocol version sent by the kernel for ++ * communication and reply with that major version (and an arbitrary ++ * supported minor version). 
++ */ ++ ++/** Version number of this interface */ ++#define FUSE_KERNEL_VERSION 7 ++ ++/** Minor version number of this interface */ ++#define FUSE_KERNEL_MINOR_VERSION 31 ++ ++/** The node ID of the root inode */ ++#define FUSE_ROOT_ID 1 ++ ++/* Make sure all structures are padded to 64bit boundary, so 32bit ++ userspace works under 64bit kernels */ ++ ++struct fuse_attr { ++ uint64_t ino; ++ uint64_t size; ++ uint64_t blocks; ++ uint64_t atime; ++ uint64_t mtime; ++ uint64_t ctime; ++ uint32_t atimensec; ++ uint32_t mtimensec; ++ uint32_t ctimensec; ++ uint32_t mode; ++ uint32_t nlink; ++ uint32_t uid; ++ uint32_t gid; ++ uint32_t rdev; ++ uint32_t blksize; ++ uint32_t padding; ++}; ++ ++struct fuse_kstatfs { ++ uint64_t blocks; ++ uint64_t bfree; ++ uint64_t bavail; ++ uint64_t files; ++ uint64_t ffree; ++ uint32_t bsize; ++ uint32_t namelen; ++ uint32_t frsize; ++ uint32_t padding; ++ uint32_t spare[6]; ++}; ++ ++struct fuse_file_lock { ++ uint64_t start; ++ uint64_t end; ++ uint32_t type; ++ uint32_t pid; /* tgid */ ++}; ++ ++/** ++ * Bitmasks for fuse_setattr_in.valid ++ */ ++#define FATTR_MODE (1 << 0) ++#define FATTR_UID (1 << 1) ++#define FATTR_GID (1 << 2) ++#define FATTR_SIZE (1 << 3) ++#define FATTR_ATIME (1 << 4) ++#define FATTR_MTIME (1 << 5) ++#define FATTR_FH (1 << 6) ++#define FATTR_ATIME_NOW (1 << 7) ++#define FATTR_MTIME_NOW (1 << 8) ++#define FATTR_LOCKOWNER (1 << 9) ++#define FATTR_CTIME (1 << 10) ++ ++/** ++ * Flags returned by the OPEN request ++ * ++ * FOPEN_DIRECT_IO: bypass page cache for this open file ++ * FOPEN_KEEP_CACHE: don't invalidate the data cache on open ++ * FOPEN_NONSEEKABLE: the file is not seekable ++ * FOPEN_CACHE_DIR: allow caching this directory ++ * FOPEN_STREAM: the file is stream-like (no file position at all) ++ */ ++#define FOPEN_DIRECT_IO (1 << 0) ++#define FOPEN_KEEP_CACHE (1 << 1) ++#define FOPEN_NONSEEKABLE (1 << 2) ++#define FOPEN_CACHE_DIR (1 << 3) ++#define FOPEN_STREAM (1 << 4) ++ ++/** ++ * INIT 
request/reply flags ++ * ++ * FUSE_ASYNC_READ: asynchronous read requests ++ * FUSE_POSIX_LOCKS: remote locking for POSIX file locks ++ * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported) ++ * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem ++ * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." ++ * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB ++ * FUSE_DONT_MASK: don't apply umask to file mode on create operations ++ * FUSE_SPLICE_WRITE: kernel supports splice write on the device ++ * FUSE_SPLICE_MOVE: kernel supports splice move on the device ++ * FUSE_SPLICE_READ: kernel supports splice read on the device ++ * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks ++ * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories ++ * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages ++ * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one) ++ * FUSE_READDIRPLUS_AUTO: adaptive readdirplus ++ * FUSE_ASYNC_DIO: asynchronous direct I/O submission ++ * FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes ++ * FUSE_NO_OPEN_SUPPORT: kernel supports zero-message opens ++ * FUSE_PARALLEL_DIROPS: allow parallel lookups and readdir ++ * FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc ++ * FUSE_POSIX_ACL: filesystem supports posix acls ++ * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED ++ * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages ++ * FUSE_CACHE_SYMLINKS: cache READLINK responses ++ * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir ++ * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request ++ * FUSE_MAP_ALIGNMENT: map_alignment field is valid ++ */ ++#define FUSE_ASYNC_READ (1 << 0) ++#define FUSE_POSIX_LOCKS (1 << 1) ++#define FUSE_FILE_OPS (1 << 2) ++#define FUSE_ATOMIC_O_TRUNC (1 << 3) ++#define FUSE_EXPORT_SUPPORT (1 << 4) ++#define 
FUSE_BIG_WRITES (1 << 5) ++#define FUSE_DONT_MASK (1 << 6) ++#define FUSE_SPLICE_WRITE (1 << 7) ++#define FUSE_SPLICE_MOVE (1 << 8) ++#define FUSE_SPLICE_READ (1 << 9) ++#define FUSE_FLOCK_LOCKS (1 << 10) ++#define FUSE_HAS_IOCTL_DIR (1 << 11) ++#define FUSE_AUTO_INVAL_DATA (1 << 12) ++#define FUSE_DO_READDIRPLUS (1 << 13) ++#define FUSE_READDIRPLUS_AUTO (1 << 14) ++#define FUSE_ASYNC_DIO (1 << 15) ++#define FUSE_WRITEBACK_CACHE (1 << 16) ++#define FUSE_NO_OPEN_SUPPORT (1 << 17) ++#define FUSE_PARALLEL_DIROPS (1 << 18) ++#define FUSE_HANDLE_KILLPRIV (1 << 19) ++#define FUSE_POSIX_ACL (1 << 20) ++#define FUSE_ABORT_ERROR (1 << 21) ++#define FUSE_MAX_PAGES (1 << 22) ++#define FUSE_CACHE_SYMLINKS (1 << 23) ++#define FUSE_NO_OPENDIR_SUPPORT (1 << 24) ++#define FUSE_EXPLICIT_INVAL_DATA (1 << 25) ++#define FUSE_MAP_ALIGNMENT (1 << 26) ++ ++/** ++ * CUSE INIT request/reply flags ++ * ++ * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl ++ */ ++#define CUSE_UNRESTRICTED_IOCTL (1 << 0) ++ ++/** ++ * Release flags ++ */ ++#define FUSE_RELEASE_FLUSH (1 << 0) ++#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1) ++ ++/** ++ * Getattr flags ++ */ ++#define FUSE_GETATTR_FH (1 << 0) ++ ++/** ++ * Lock flags ++ */ ++#define FUSE_LK_FLOCK (1 << 0) ++ ++/** ++ * WRITE flags ++ * ++ * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed ++ * FUSE_WRITE_LOCKOWNER: lock_owner field is valid ++ * FUSE_WRITE_KILL_PRIV: kill suid and sgid bits ++ */ ++#define FUSE_WRITE_CACHE (1 << 0) ++#define FUSE_WRITE_LOCKOWNER (1 << 1) ++#define FUSE_WRITE_KILL_PRIV (1 << 2) ++ ++/** ++ * Read flags ++ */ ++#define FUSE_READ_LOCKOWNER (1 << 1) ++ ++/** ++ * Ioctl flags ++ * ++ * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine ++ * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed ++ * FUSE_IOCTL_RETRY: retry with new iovecs ++ * FUSE_IOCTL_32BIT: 32bit ioctl ++ * FUSE_IOCTL_DIR: is a directory ++ * FUSE_IOCTL_COMPAT_X32: x32 compat ioctl on 64bit machine 
(64bit time_t) ++ * ++ * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs ++ */ ++#define FUSE_IOCTL_COMPAT (1 << 0) ++#define FUSE_IOCTL_UNRESTRICTED (1 << 1) ++#define FUSE_IOCTL_RETRY (1 << 2) ++#define FUSE_IOCTL_32BIT (1 << 3) ++#define FUSE_IOCTL_DIR (1 << 4) ++#define FUSE_IOCTL_COMPAT_X32 (1 << 5) ++ ++#define FUSE_IOCTL_MAX_IOV 256 ++ ++/** ++ * Poll flags ++ * ++ * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify ++ */ ++#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0) ++ ++/** ++ * Fsync flags ++ * ++ * FUSE_FSYNC_FDATASYNC: Sync data only, not metadata ++ */ ++#define FUSE_FSYNC_FDATASYNC (1 << 0) ++ ++enum fuse_opcode { ++ FUSE_LOOKUP = 1, ++ FUSE_FORGET = 2, /* no reply */ ++ FUSE_GETATTR = 3, ++ FUSE_SETATTR = 4, ++ FUSE_READLINK = 5, ++ FUSE_SYMLINK = 6, ++ FUSE_MKNOD = 8, ++ FUSE_MKDIR = 9, ++ FUSE_UNLINK = 10, ++ FUSE_RMDIR = 11, ++ FUSE_RENAME = 12, ++ FUSE_LINK = 13, ++ FUSE_OPEN = 14, ++ FUSE_READ = 15, ++ FUSE_WRITE = 16, ++ FUSE_STATFS = 17, ++ FUSE_RELEASE = 18, ++ FUSE_FSYNC = 20, ++ FUSE_SETXATTR = 21, ++ FUSE_GETXATTR = 22, ++ FUSE_LISTXATTR = 23, ++ FUSE_REMOVEXATTR = 24, ++ FUSE_FLUSH = 25, ++ FUSE_INIT = 26, ++ FUSE_OPENDIR = 27, ++ FUSE_READDIR = 28, ++ FUSE_RELEASEDIR = 29, ++ FUSE_FSYNCDIR = 30, ++ FUSE_GETLK = 31, ++ FUSE_SETLK = 32, ++ FUSE_SETLKW = 33, ++ FUSE_ACCESS = 34, ++ FUSE_CREATE = 35, ++ FUSE_INTERRUPT = 36, ++ FUSE_BMAP = 37, ++ FUSE_DESTROY = 38, ++ FUSE_IOCTL = 39, ++ FUSE_POLL = 40, ++ FUSE_NOTIFY_REPLY = 41, ++ FUSE_BATCH_FORGET = 42, ++ FUSE_FALLOCATE = 43, ++ FUSE_READDIRPLUS = 44, ++ FUSE_RENAME2 = 45, ++ FUSE_LSEEK = 46, ++ FUSE_COPY_FILE_RANGE = 47, ++ FUSE_SETUPMAPPING = 48, ++ FUSE_REMOVEMAPPING = 49, ++ ++ /* CUSE specific operations */ ++ CUSE_INIT = 4096, ++ ++ /* Reserved opcodes: helpful to detect structure endian-ness */ ++ CUSE_INIT_BSWAP_RESERVED = 1048576, /* CUSE_INIT << 8 */ ++ FUSE_INIT_BSWAP_RESERVED = 436207616, /* FUSE_INIT << 24 */ ++}; ++ ++enum fuse_notify_code { ++ FUSE_NOTIFY_POLL = 1, ++ 
FUSE_NOTIFY_INVAL_INODE = 2, ++ FUSE_NOTIFY_INVAL_ENTRY = 3, ++ FUSE_NOTIFY_STORE = 4, ++ FUSE_NOTIFY_RETRIEVE = 5, ++ FUSE_NOTIFY_DELETE = 6, ++ FUSE_NOTIFY_CODE_MAX, ++}; ++ ++/* The read buffer is required to be at least 8k, but may be much larger */ ++#define FUSE_MIN_READ_BUFFER 8192 ++ ++#define FUSE_COMPAT_ENTRY_OUT_SIZE 120 ++ ++struct fuse_entry_out { ++ uint64_t nodeid; /* Inode ID */ ++ uint64_t generation; /* Inode generation: nodeid:gen must ++ be unique for the fs's lifetime */ ++ uint64_t entry_valid; /* Cache timeout for the name */ ++ uint64_t attr_valid; /* Cache timeout for the attributes */ ++ uint32_t entry_valid_nsec; ++ uint32_t attr_valid_nsec; ++ struct fuse_attr attr; ++}; ++ ++struct fuse_forget_in { ++ uint64_t nlookup; ++}; ++ ++struct fuse_forget_one { ++ uint64_t nodeid; ++ uint64_t nlookup; ++}; ++ ++struct fuse_batch_forget_in { ++ uint32_t count; ++ uint32_t dummy; ++}; ++ ++struct fuse_getattr_in { ++ uint32_t getattr_flags; ++ uint32_t dummy; ++ uint64_t fh; ++}; ++ ++#define FUSE_COMPAT_ATTR_OUT_SIZE 96 ++ ++struct fuse_attr_out { ++ uint64_t attr_valid; /* Cache timeout for the attributes */ ++ uint32_t attr_valid_nsec; ++ uint32_t dummy; ++ struct fuse_attr attr; ++}; ++ ++#define FUSE_COMPAT_MKNOD_IN_SIZE 8 ++ ++struct fuse_mknod_in { ++ uint32_t mode; ++ uint32_t rdev; ++ uint32_t umask; ++ uint32_t padding; ++}; ++ ++struct fuse_mkdir_in { ++ uint32_t mode; ++ uint32_t umask; ++}; ++ ++struct fuse_rename_in { ++ uint64_t newdir; ++}; ++ ++struct fuse_rename2_in { ++ uint64_t newdir; ++ uint32_t flags; ++ uint32_t padding; ++}; ++ ++struct fuse_link_in { ++ uint64_t oldnodeid; ++}; ++ ++struct fuse_setattr_in { ++ uint32_t valid; ++ uint32_t padding; ++ uint64_t fh; ++ uint64_t size; ++ uint64_t lock_owner; ++ uint64_t atime; ++ uint64_t mtime; ++ uint64_t ctime; ++ uint32_t atimensec; ++ uint32_t mtimensec; ++ uint32_t ctimensec; ++ uint32_t mode; ++ uint32_t unused4; ++ uint32_t uid; ++ uint32_t gid; ++ uint32_t unused5; 
++}; ++ ++struct fuse_open_in { ++ uint32_t flags; ++ uint32_t unused; ++}; ++ ++struct fuse_create_in { ++ uint32_t flags; ++ uint32_t mode; ++ uint32_t umask; ++ uint32_t padding; ++}; ++ ++struct fuse_open_out { ++ uint64_t fh; ++ uint32_t open_flags; ++ uint32_t padding; ++}; ++ ++struct fuse_release_in { ++ uint64_t fh; ++ uint32_t flags; ++ uint32_t release_flags; ++ uint64_t lock_owner; ++}; ++ ++struct fuse_flush_in { ++ uint64_t fh; ++ uint32_t unused; ++ uint32_t padding; ++ uint64_t lock_owner; ++}; ++ ++struct fuse_read_in { ++ uint64_t fh; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t read_flags; ++ uint64_t lock_owner; ++ uint32_t flags; ++ uint32_t padding; ++}; ++ ++#define FUSE_COMPAT_WRITE_IN_SIZE 24 ++ ++struct fuse_write_in { ++ uint64_t fh; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t write_flags; ++ uint64_t lock_owner; ++ uint32_t flags; ++ uint32_t padding; ++}; ++ ++struct fuse_write_out { ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++#define FUSE_COMPAT_STATFS_SIZE 48 ++ ++struct fuse_statfs_out { ++ struct fuse_kstatfs st; ++}; ++ ++struct fuse_fsync_in { ++ uint64_t fh; ++ uint32_t fsync_flags; ++ uint32_t padding; ++}; ++ ++struct fuse_setxattr_in { ++ uint32_t size; ++ uint32_t flags; ++}; ++ ++struct fuse_getxattr_in { ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++struct fuse_getxattr_out { ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++struct fuse_lk_in { ++ uint64_t fh; ++ uint64_t owner; ++ struct fuse_file_lock lk; ++ uint32_t lk_flags; ++ uint32_t padding; ++}; ++ ++struct fuse_lk_out { ++ struct fuse_file_lock lk; ++}; ++ ++struct fuse_access_in { ++ uint32_t mask; ++ uint32_t padding; ++}; ++ ++struct fuse_init_in { ++ uint32_t major; ++ uint32_t minor; ++ uint32_t max_readahead; ++ uint32_t flags; ++}; ++ ++#define FUSE_COMPAT_INIT_OUT_SIZE 8 ++#define FUSE_COMPAT_22_INIT_OUT_SIZE 24 ++ ++struct fuse_init_out { ++ uint32_t major; ++ uint32_t minor; ++ uint32_t max_readahead; ++ uint32_t flags; ++ uint16_t 
max_background; ++ uint16_t congestion_threshold; ++ uint32_t max_write; ++ uint32_t time_gran; ++ uint16_t max_pages; ++ uint16_t map_alignment; ++ uint32_t unused[8]; ++}; ++ ++#define CUSE_INIT_INFO_MAX 4096 ++ ++struct cuse_init_in { ++ uint32_t major; ++ uint32_t minor; ++ uint32_t unused; ++ uint32_t flags; ++}; ++ ++struct cuse_init_out { ++ uint32_t major; ++ uint32_t minor; ++ uint32_t unused; ++ uint32_t flags; ++ uint32_t max_read; ++ uint32_t max_write; ++ uint32_t dev_major; /* chardev major */ ++ uint32_t dev_minor; /* chardev minor */ ++ uint32_t spare[10]; ++}; ++ ++struct fuse_interrupt_in { ++ uint64_t unique; ++}; ++ ++struct fuse_bmap_in { ++ uint64_t block; ++ uint32_t blocksize; ++ uint32_t padding; ++}; ++ ++struct fuse_bmap_out { ++ uint64_t block; ++}; ++ ++struct fuse_ioctl_in { ++ uint64_t fh; ++ uint32_t flags; ++ uint32_t cmd; ++ uint64_t arg; ++ uint32_t in_size; ++ uint32_t out_size; ++}; ++ ++struct fuse_ioctl_iovec { ++ uint64_t base; ++ uint64_t len; ++}; ++ ++struct fuse_ioctl_out { ++ int32_t result; ++ uint32_t flags; ++ uint32_t in_iovs; ++ uint32_t out_iovs; ++}; ++ ++struct fuse_poll_in { ++ uint64_t fh; ++ uint64_t kh; ++ uint32_t flags; ++ uint32_t events; ++}; ++ ++struct fuse_poll_out { ++ uint32_t revents; ++ uint32_t padding; ++}; ++ ++struct fuse_notify_poll_wakeup_out { ++ uint64_t kh; ++}; ++ ++struct fuse_fallocate_in { ++ uint64_t fh; ++ uint64_t offset; ++ uint64_t length; ++ uint32_t mode; ++ uint32_t padding; ++}; ++ ++struct fuse_in_header { ++ uint32_t len; ++ uint32_t opcode; ++ uint64_t unique; ++ uint64_t nodeid; ++ uint32_t uid; ++ uint32_t gid; ++ uint32_t pid; ++ uint32_t padding; ++}; ++ ++struct fuse_out_header { ++ uint32_t len; ++ int32_t error; ++ uint64_t unique; ++}; ++ ++struct fuse_dirent { ++ uint64_t ino; ++ uint64_t off; ++ uint32_t namelen; ++ uint32_t type; ++ char name[]; ++}; ++ ++#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) ++#define FUSE_DIRENT_ALIGN(x) \ ++ (((x) + 
sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1)) ++#define FUSE_DIRENT_SIZE(d) \ ++ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) ++ ++struct fuse_direntplus { ++ struct fuse_entry_out entry_out; ++ struct fuse_dirent dirent; ++}; ++ ++#define FUSE_NAME_OFFSET_DIRENTPLUS \ ++ offsetof(struct fuse_direntplus, dirent.name) ++#define FUSE_DIRENTPLUS_SIZE(d) \ ++ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen) ++ ++struct fuse_notify_inval_inode_out { ++ uint64_t ino; ++ int64_t off; ++ int64_t len; ++}; ++ ++struct fuse_notify_inval_entry_out { ++ uint64_t parent; ++ uint32_t namelen; ++ uint32_t padding; ++}; ++ ++struct fuse_notify_delete_out { ++ uint64_t parent; ++ uint64_t child; ++ uint32_t namelen; ++ uint32_t padding; ++}; ++ ++struct fuse_notify_store_out { ++ uint64_t nodeid; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++struct fuse_notify_retrieve_out { ++ uint64_t notify_unique; ++ uint64_t nodeid; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++/* Matches the size of fuse_write_in */ ++struct fuse_notify_retrieve_in { ++ uint64_t dummy1; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t dummy2; ++ uint64_t dummy3; ++ uint64_t dummy4; ++}; ++ ++/* Device ioctls: */ ++#define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t) ++ ++struct fuse_lseek_in { ++ uint64_t fh; ++ uint64_t offset; ++ uint32_t whence; ++ uint32_t padding; ++}; ++ ++struct fuse_lseek_out { ++ uint64_t offset; ++}; ++ ++struct fuse_copy_file_range_in { ++ uint64_t fh_in; ++ uint64_t off_in; ++ uint64_t nodeid_out; ++ uint64_t fh_out; ++ uint64_t off_out; ++ uint64_t len; ++ uint64_t flags; ++}; ++ ++#endif /* _LINUX_FUSE_H */ +diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh +index f76d773..29c27f4 100755 +--- a/scripts/update-linux-headers.sh ++++ b/scripts/update-linux-headers.sh +@@ -186,6 +186,7 @@ rm -rf "$output/include/standard-headers/linux" + mkdir -p
"$output/include/standard-headers/linux" + for i in "$tmpdir"/include/linux/*virtio*.h \ + "$tmpdir/include/linux/qemu_fw_cfg.h" \ ++ "$tmpdir/include/linux/fuse.h" \ + "$tmpdir/include/linux/input.h" \ + "$tmpdir/include/linux/input-event-codes.h" \ + "$tmpdir/include/linux/pci_regs.h" \ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Pull-in-upstream-headers.patch b/kvm-virtiofsd-Pull-in-upstream-headers.patch new file mode 100755 index 0000000..78784fb --- /dev/null +++ b/kvm-virtiofsd-Pull-in-upstream-headers.patch @@ -0,0 +1,4911 @@ +From 434b51e5c2fce756906dec4803900397bc98ad72 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:39 +0100 +Subject: [PATCH 008/116] virtiofsd: Pull in upstream headers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-5-dgilbert@redhat.com> +Patchwork-id: 93457 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 004/112] virtiofsd: Pull in upstream headers +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Pull in headers from libfuse's upstream fuse-3.8.0 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr.
David Alan Gilbert +(cherry picked from commit ee46c78901eb7fa78e328e04c0494ad6d207238b) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse.h | 1275 ++++++++++++++++++++ + tools/virtiofsd/fuse_common.h | 823 +++++++++++++ + tools/virtiofsd/fuse_i.h | 139 +++ + tools/virtiofsd/fuse_log.h | 82 ++ + tools/virtiofsd/fuse_lowlevel.h | 2089 +++++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_misc.h | 59 + + tools/virtiofsd/fuse_opt.h | 271 +++++ + tools/virtiofsd/passthrough_helpers.h | 76 ++ + 8 files changed, 4814 insertions(+) + create mode 100644 tools/virtiofsd/fuse.h + create mode 100644 tools/virtiofsd/fuse_common.h + create mode 100644 tools/virtiofsd/fuse_i.h + create mode 100644 tools/virtiofsd/fuse_log.h + create mode 100644 tools/virtiofsd/fuse_lowlevel.h + create mode 100644 tools/virtiofsd/fuse_misc.h + create mode 100644 tools/virtiofsd/fuse_opt.h + create mode 100644 tools/virtiofsd/passthrough_helpers.h + +diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h +new file mode 100644 +index 0000000..883f6e5 +--- /dev/null ++++ b/tools/virtiofsd/fuse.h +@@ -0,0 +1,1275 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++#ifndef FUSE_H_ ++#define FUSE_H_ ++ ++/** @file ++ * ++ * This file defines the library interface of FUSE ++ * ++ * IMPORTANT: you should define FUSE_USE_VERSION before including this header. ++ */ ++ ++#include "fuse_common.h" ++ ++#include <fcntl.h> ++#include <time.h> ++#include <sys/types.h> ++#include <sys/stat.h> ++#include <sys/statvfs.h> ++#include <sys/uio.h> ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ----------------------------------------------------------- * ++ * Basic FUSE API * ++ * ----------------------------------------------------------- */ ++ ++/** Handle for a FUSE filesystem */ ++struct fuse; ++ ++/** ++ * Readdir flags, passed to ->readdir() ++ */ ++enum fuse_readdir_flags { ++ /** ++ * "Plus" mode.
++ * ++ * The kernel wants to prefill the inode cache during readdir. The ++ * filesystem may honour this by filling in the attributes and setting ++ * FUSE_FILL_DIR_FLAGS for the filler function. The filesystem may also ++ * just ignore this flag completely. ++ */ ++ FUSE_READDIR_PLUS = (1 << 0), ++}; ++ ++enum fuse_fill_dir_flags { ++ /** ++ * "Plus" mode: all file attributes are valid ++ * ++ * The attributes are used by the kernel to prefill the inode cache ++ * during a readdir. ++ * ++ * It is okay to set FUSE_FILL_DIR_PLUS if FUSE_READDIR_PLUS is not set ++ * and vice versa. ++ */ ++ FUSE_FILL_DIR_PLUS = (1 << 1), ++}; ++ ++/** Function to add an entry in a readdir() operation ++ * ++ * The *off* parameter can be any non-zero value that enables the ++ * filesystem to identify the current point in the directory ++ * stream. It does not need to be the actual physical position. A ++ * value of zero is reserved to indicate that seeking in directories ++ * is not supported. ++ * ++ * @param buf the buffer passed to the readdir() operation ++ * @param name the file name of the directory entry ++ * @param stat file attributes, can be NULL ++ * @param off offset of the next entry or zero ++ * @param flags fill flags ++ * @return 1 if buffer is full, zero otherwise ++ */ ++typedef int (*fuse_fill_dir_t) (void *buf, const char *name, ++ const struct stat *stbuf, off_t off, ++ enum fuse_fill_dir_flags flags); ++/** ++ * Configuration of the high-level API ++ * ++ * This structure is initialized from the arguments passed to ++ * fuse_new(), and then passed to the file system's init() handler ++ * which should ensure that the configuration is compatible with the ++ * file system implementation. ++ */ ++struct fuse_config { ++ /** ++ * If `set_gid` is non-zero, the st_gid attribute of each file ++ * is overwritten with the value of `gid`. 
++ */ ++ int set_gid; ++ unsigned int gid; ++ ++ /** ++ * If `set_uid` is non-zero, the st_uid attribute of each file ++ * is overwritten with the value of `uid`. ++ */ ++ int set_uid; ++ unsigned int uid; ++ ++ /** ++ * If `set_mode` is non-zero, the any permissions bits set in ++ * `umask` are unset in the st_mode attribute of each file. ++ */ ++ int set_mode; ++ unsigned int umask; ++ ++ /** ++ * The timeout in seconds for which name lookups will be ++ * cached. ++ */ ++ double entry_timeout; ++ ++ /** ++ * The timeout in seconds for which a negative lookup will be ++ * cached. This means, that if file did not exist (lookup ++ * retuned ENOENT), the lookup will only be redone after the ++ * timeout, and the file/directory will be assumed to not ++ * exist until then. A value of zero means that negative ++ * lookups are not cached. ++ */ ++ double negative_timeout; ++ ++ /** ++ * The timeout in seconds for which file/directory attributes ++ * (as returned by e.g. the `getattr` handler) are cached. ++ */ ++ double attr_timeout; ++ ++ /** ++ * Allow requests to be interrupted ++ */ ++ int intr; ++ ++ /** ++ * Specify which signal number to send to the filesystem when ++ * a request is interrupted. The default is hardcoded to ++ * USR1. ++ */ ++ int intr_signal; ++ ++ /** ++ * Normally, FUSE assigns inodes to paths only for as long as ++ * the kernel is aware of them. With this option inodes are ++ * instead remembered for at least this many seconds. This ++ * will require more memory, but may be necessary when using ++ * applications that make use of inode numbers. ++ * ++ * A number of -1 means that inodes will be remembered for the ++ * entire life-time of the file-system process. ++ */ ++ int remember; ++ ++ /** ++ * The default behavior is that if an open file is deleted, ++ * the file is renamed to a hidden file (.fuse_hiddenXXX), and ++ * only removed when the file is finally released. 
This ++ * relieves the filesystem implementation of having to deal ++ * with this problem. This option disables the hiding ++ * behavior, and files are removed immediately in an unlink ++ * operation (or in a rename operation which overwrites an ++ * existing file). ++ * ++ * It is recommended that you not use the hard_remove ++ * option. When hard_remove is set, the following libc ++ * functions fail on unlinked files (returning errno of ++ * ENOENT): read(2), write(2), fsync(2), close(2), f*xattr(2), ++ * ftruncate(2), fstat(2), fchmod(2), fchown(2) ++ */ ++ int hard_remove; ++ ++ /** ++ * Honor the st_ino field in the functions getattr() and ++ * fill_dir(). This value is used to fill in the st_ino field ++ * in the stat(2), lstat(2), fstat(2) functions and the d_ino ++ * field in the readdir(2) function. The filesystem does not ++ * have to guarantee uniqueness, however some applications ++ * rely on this value being unique for the whole filesystem. ++ * ++ * Note that this does *not* affect the inode that libfuse ++ * and the kernel use internally (also called the "nodeid"). ++ */ ++ int use_ino; ++ ++ /** ++ * If use_ino option is not given, still try to fill in the ++ * d_ino field in readdir(2). If the name was previously ++ * looked up, and is still in the cache, the inode number ++ * found there will be used. Otherwise it will be set to -1. ++ * If use_ino option is given, this option is ignored. ++ */ ++ int readdir_ino; ++ ++ /** ++ * This option disables the use of page cache (file content cache) ++ * in the kernel for this filesystem. This has several affects: ++ * ++ * 1. Each read(2) or write(2) system call will initiate one ++ * or more read or write operations, data will not be ++ * cached in the kernel. ++ * ++ * 2. The return value of the read() and write() system calls ++ * will correspond to the return values of the read and ++ * write operations. This is useful for example if the ++ * file size is not known in advance (before reading it). 
++ * ++ * Internally, enabling this option causes fuse to set the ++ * `direct_io` field of `struct fuse_file_info` - overwriting ++ * any value that was put there by the file system. ++ */ ++ int direct_io; ++ ++ /** ++ * This option disables flushing the cache of the file ++ * contents on every open(2). This should only be enabled on ++ * filesystems where the file data is never changed ++ * externally (not through the mounted FUSE filesystem). Thus ++ * it is not suitable for network filesystems and other ++ * intermediate filesystems. ++ * ++ * NOTE: if this option is not specified (and neither ++ * direct_io) data is still cached after the open(2), so a ++ * read(2) system call will not always initiate a read ++ * operation. ++ * ++ * Internally, enabling this option causes fuse to set the ++ * `keep_cache` field of `struct fuse_file_info` - overwriting ++ * any value that was put there by the file system. ++ */ ++ int kernel_cache; ++ ++ /** ++ * This option is an alternative to `kernel_cache`. Instead of ++ * unconditionally keeping cached data, the cached data is ++ * invalidated on open(2) if if the modification time or the ++ * size of the file has changed since it was last opened. ++ */ ++ int auto_cache; ++ ++ /** ++ * The timeout in seconds for which file attributes are cached ++ * for the purpose of checking if auto_cache should flush the ++ * file data on open. ++ */ ++ int ac_attr_timeout_set; ++ double ac_attr_timeout; ++ ++ /** ++ * If this option is given the file-system handlers for the ++ * following operations will not receive path information: ++ * read, write, flush, release, fsync, readdir, releasedir, ++ * fsyncdir, lock, ioctl and poll. ++ * ++ * For the truncate, getattr, chmod, chown and utimens ++ * operations the path will be provided only if the struct ++ * fuse_file_info argument is NULL. ++ */ ++ int nullpath_ok; ++ ++ /** ++ * The remaining options are used by libfuse internally and ++ * should not be touched. 
++ */ ++ int show_help; ++ char *modules; ++ int debug; ++}; ++ ++ ++/** ++ * The file system operations: ++ * ++ * Most of these should work very similarly to the well known UNIX ++ * file system operations. A major exception is that instead of ++ * returning an error in 'errno', the operation should return the ++ * negated error value (-errno) directly. ++ * ++ * All methods are optional, but some are essential for a useful ++ * filesystem (e.g. getattr). Open, flush, release, fsync, opendir, ++ * releasedir, fsyncdir, access, create, truncate, lock, init and ++ * destroy are special purpose methods, without which a full featured ++ * filesystem can still be implemented. ++ * ++ * In general, all methods are expected to perform any necessary ++ * permission checking. However, a filesystem may delegate this task ++ * to the kernel by passing the `default_permissions` mount option to ++ * `fuse_new()`. In this case, methods will only be called if ++ * the kernel's permission check has succeeded. ++ * ++ * Almost all operations take a path which can be of any length. ++ */ ++struct fuse_operations { ++ /** Get file attributes. ++ * ++ * Similar to stat(). The 'st_dev' and 'st_blksize' fields are ++ * ignored. The 'st_ino' field is ignored except if the 'use_ino' ++ * mount option is given. In that case it is passed to userspace, ++ * but libfuse and the kernel will still assign a different ++ * inode for internal use (called the "nodeid"). ++ * ++ * `fi` will always be NULL if the file is not currently open, but ++ * may also be NULL if the file is open. ++ */ ++ int (*getattr) (const char *, struct stat *, struct fuse_file_info *fi); ++ ++ /** Read the target of a symbolic link ++ * ++ * The buffer should be filled with a null terminated string. The ++ * buffer size argument includes the space for the terminating ++ * null character. If the linkname is too long to fit in the ++ * buffer, it should be truncated. The return value should be 0 ++ * for success. 
++ */ ++ int (*readlink) (const char *, char *, size_t); ++ ++ /** Create a file node ++ * ++ * This is called for creation of all non-directory, non-symlink ++ * nodes. If the filesystem defines a create() method, then for ++ * regular files that will be called instead. ++ */ ++ int (*mknod) (const char *, mode_t, dev_t); ++ ++ /** Create a directory ++ * ++ * Note that the mode argument may not have the type specification ++ * bits set, i.e. S_ISDIR(mode) can be false. To obtain the ++ * correct directory type bits use mode|S_IFDIR ++ * */ ++ int (*mkdir) (const char *, mode_t); ++ ++ /** Remove a file */ ++ int (*unlink) (const char *); ++ ++ /** Remove a directory */ ++ int (*rmdir) (const char *); ++ ++ /** Create a symbolic link */ ++ int (*symlink) (const char *, const char *); ++ ++ /** Rename a file ++ * ++ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If ++ * RENAME_NOREPLACE is specified, the filesystem must not ++ * overwrite *newname* if it exists and return an error ++ * instead. If `RENAME_EXCHANGE` is specified, the filesystem ++ * must atomically exchange the two files, i.e. both must ++ * exist and neither may be deleted. ++ */ ++ int (*rename) (const char *, const char *, unsigned int flags); ++ ++ /** Create a hard link to a file */ ++ int (*link) (const char *, const char *); ++ ++ /** Change the permission bits of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ */ ++ int (*chmod) (const char *, mode_t, struct fuse_file_info *fi); ++ ++ /** Change the owner and group of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. 
++ */ ++ int (*chown) (const char *, uid_t, gid_t, struct fuse_file_info *fi); ++ ++ /** Change the size of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*truncate) (const char *, off_t, struct fuse_file_info *fi); ++ ++ /** Open a file ++ * ++ * Open flags are available in fi->flags. The following rules ++ * apply. ++ * ++ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be ++ * filtered out / handled by the kernel. ++ * ++ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR, O_EXEC, O_SEARCH) ++ * should be used by the filesystem to check if the operation is ++ * permitted. If the ``-o default_permissions`` mount option is ++ * given, this check is already done by the kernel before calling ++ * open() and may thus be omitted by the filesystem. ++ * ++ * - When writeback caching is enabled, the kernel may send ++ * read requests even for files opened with O_WRONLY. The ++ * filesystem should be prepared to handle this. ++ * ++ * - When writeback caching is disabled, the filesystem is ++ * expected to properly handle the O_APPEND flag and ensure ++ * that each write is appending to the end of the file. ++ * ++ * - When writeback caching is enabled, the kernel will ++ * handle O_APPEND. However, unless all changes to the file ++ * come through the kernel this will not work reliably. The ++ * filesystem should thus either ignore the O_APPEND flag ++ * (and let the kernel handle it), or return an error ++ * (indicating that reliably O_APPEND is not available). ++ * ++ * Filesystem may store an arbitrary file handle (pointer, ++ * index, etc) in fi->fh, and use this in other all other file ++ * operations (read, write, flush, release, fsync). ++ * ++ * Filesystem may also implement stateless file I/O and not store ++ * anything in fi->fh. 
++ * ++ * There are also some flags (direct_io, keep_cache) which the ++ * filesystem may set in fi, to change the way the file is opened. ++ * See fuse_file_info structure in <fuse_common.h> for more details. ++ * ++ * If this request is answered with an error code of ENOSYS ++ * and FUSE_CAP_NO_OPEN_SUPPORT is set in ++ * `fuse_conn_info.capable`, this is treated as success and ++ * future calls to open will also succeed without being send ++ * to the filesystem process. ++ * ++ */ ++ int (*open) (const char *, struct fuse_file_info *); ++ ++ /** Read data from an open file ++ * ++ * Read should return exactly the number of bytes requested except ++ * on EOF or error, otherwise the rest of the data will be ++ * substituted with zeroes. An exception to this is when the ++ * 'direct_io' mount option is specified, in which case the return ++ * value of the read system call will reflect the return value of ++ * this operation. ++ */ ++ int (*read) (const char *, char *, size_t, off_t, ++ struct fuse_file_info *); ++ ++ /** Write data to an open file ++ * ++ * Write should return exactly the number of bytes requested ++ * except on error. An exception to this is when the 'direct_io' ++ * mount option is specified (see read operation). ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*write) (const char *, const char *, size_t, off_t, ++ struct fuse_file_info *); ++ ++ /** Get file system statistics ++ * ++ * The 'f_favail', 'f_fsid' and 'f_flag' fields are ignored ++ */ ++ int (*statfs) (const char *, struct statvfs *); ++ ++ /** Possibly flush cached data ++ * ++ * BIG NOTE: This is not equivalent to fsync(). It's not a ++ * request to sync dirty data. ++ * ++ * Flush is called on each close() of a file descriptor, as opposed to ++ * release which is called on the close of the last file descriptor for ++ * a file.
Under Linux, errors returned by flush() will be passed to ++ * userspace as errors from close(), so flush() is a good place to write ++ * back any cached dirty data. However, many applications ignore errors ++ * on close(), and on non-Linux systems, close() may succeed even if flush() ++ * returns an error. For these reasons, filesystems should not assume ++ * that errors returned by flush will ever be noticed or even ++ * delivered. ++ * ++ * NOTE: The flush() method may be called more than once for each ++ * open(). This happens if more than one file descriptor refers to an ++ * open file handle, e.g. due to dup(), dup2() or fork() calls. It is ++ * not possible to determine if a flush is final, so each flush should ++ * be treated equally. Multiple write-flush sequences are relatively ++ * rare, so this shouldn't be a problem. ++ * ++ * Filesystems shouldn't assume that flush will be called at any ++ * particular point. It may be called more times than expected, or not ++ * at all. ++ * ++ * [close]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html ++ */ ++ int (*flush) (const char *, struct fuse_file_info *); ++ ++ /** Release an open file ++ * ++ * Release is called when there are no more references to an open ++ * file: all file descriptors are closed and all memory mappings ++ * are unmapped. ++ * ++ * For every open() call there will be exactly one release() call ++ * with the same flags and file handle. It is possible to ++ * have a file opened more than once, in which case only the last ++ * release will mean, that no more reads/writes will happen on the ++ * file. The return value of release is ignored. ++ */ ++ int (*release) (const char *, struct fuse_file_info *); ++ ++ /** Synchronize file contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data. 
++ */ ++ int (*fsync) (const char *, int, struct fuse_file_info *); ++ ++ /** Set extended attributes */ ++ int (*setxattr) (const char *, const char *, const char *, size_t, int); ++ ++ /** Get extended attributes */ ++ int (*getxattr) (const char *, const char *, char *, size_t); ++ ++ /** List extended attributes */ ++ int (*listxattr) (const char *, char *, size_t); ++ ++ /** Remove extended attributes */ ++ int (*removexattr) (const char *, const char *); ++ ++ /** Open directory ++ * ++ * Unless the 'default_permissions' mount option is given, ++ * this method should check if opendir is permitted for this ++ * directory. Optionally opendir may also return an arbitrary ++ * filehandle in the fuse_file_info structure, which will be ++ * passed to readdir, releasedir and fsyncdir. ++ */ ++ int (*opendir) (const char *, struct fuse_file_info *); ++ ++ /** Read directory ++ * ++ * The filesystem may choose between two modes of operation: ++ * ++ * 1) The readdir implementation ignores the offset parameter, and ++ * passes zero to the filler function's offset. The filler ++ * function will not return '1' (unless an error happens), so the ++ * whole directory is read in a single readdir operation. ++ * ++ * 2) The readdir implementation keeps track of the offsets of the ++ * directory entries. It uses the offset parameter and always ++ * passes non-zero offset to the filler function. When the buffer ++ * is full (or an error happens) the filler function will return ++ * '1'. 
++ */ ++ int (*readdir) (const char *, void *, fuse_fill_dir_t, off_t, ++ struct fuse_file_info *, enum fuse_readdir_flags); ++ ++ /** Release directory ++ */ ++ int (*releasedir) (const char *, struct fuse_file_info *); ++ ++ /** Synchronize directory contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data ++ */ ++ int (*fsyncdir) (const char *, int, struct fuse_file_info *); ++ ++ /** ++ * Initialize filesystem ++ * ++ * The return value will passed in the `private_data` field of ++ * `struct fuse_context` to all file operations, and as a ++ * parameter to the destroy() method. It overrides the initial ++ * value provided to fuse_main() / fuse_new(). ++ */ ++ void *(*init) (struct fuse_conn_info *conn, ++ struct fuse_config *cfg); ++ ++ /** ++ * Clean up filesystem ++ * ++ * Called on filesystem exit. ++ */ ++ void (*destroy) (void *private_data); ++ ++ /** ++ * Check file access permissions ++ * ++ * This will be called for the access() system call. If the ++ * 'default_permissions' mount option is given, this method is not ++ * called. ++ * ++ * This method is not called under Linux kernel versions 2.4.x ++ */ ++ int (*access) (const char *, int); ++ ++ /** ++ * Create and open a file ++ * ++ * If the file does not exist, first create it with the specified ++ * mode, and then open it. ++ * ++ * If this method is not implemented or under Linux kernel ++ * versions earlier than 2.6.15, the mknod() and open() methods ++ * will be called instead. ++ */ ++ int (*create) (const char *, mode_t, struct fuse_file_info *); ++ ++ /** ++ * Perform POSIX file locking operation ++ * ++ * The cmd argument will be either F_GETLK, F_SETLK or F_SETLKW. ++ * ++ * For the meaning of fields in 'struct flock' see the man page ++ * for fcntl(2). The l_whence field will always be set to ++ * SEEK_SET. ++ * ++ * For checking lock ownership, the 'fuse_file_info->owner' ++ * argument must be used. 
++ * ++ * For F_GETLK operation, the library will first check currently ++ * held locks, and if a conflicting lock is found it will return ++ * information without calling this method. This ensures, that ++ * for local locks the l_pid field is correctly filled in. The ++ * results may not be accurate in case of race conditions and in ++ * the presence of hard links, but it's unlikely that an ++ * application would rely on accurate GETLK results in these ++ * cases. If a conflicting lock is not found, this method will be ++ * called, and the filesystem may fill out l_pid by a meaningful ++ * value, or it may leave this field zero. ++ * ++ * For F_SETLK and F_SETLKW the l_pid field will be set to the pid ++ * of the process performing the locking operation. ++ * ++ * Note: if this method is not implemented, the kernel will still ++ * allow file locking to work locally. Hence it is only ++ * interesting for network filesystems and similar. ++ */ ++ int (*lock) (const char *, struct fuse_file_info *, int cmd, ++ struct flock *); ++ ++ /** ++ * Change the access and modification times of a file with ++ * nanosecond resolution ++ * ++ * This supersedes the old utime() interface. New applications ++ * should use this. ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * See the utimensat(2) man page for details. ++ */ ++ int (*utimens) (const char *, const struct timespec tv[2], ++ struct fuse_file_info *fi); ++ ++ /** ++ * Map block index within file to block index within device ++ * ++ * Note: This makes sense only for block device backed filesystems ++ * mounted with the 'blkdev' option ++ */ ++ int (*bmap) (const char *, size_t blocksize, uint64_t *idx); ++ ++ /** ++ * Ioctl ++ * ++ * flags will have FUSE_IOCTL_COMPAT set for 32bit ioctls in ++ * 64bit environment. The size and direction of data is ++ * determined by _IOC_*() decoding of cmd. 
For _IOC_NONE, ++ * data will be NULL, for _IOC_WRITE data is out area, for ++ * _IOC_READ in area and if both are set in/out area. In all ++ * non-NULL cases, the area is of _IOC_SIZE(cmd) bytes. ++ * ++ * If flags has FUSE_IOCTL_DIR then the fuse_file_info refers to a ++ * directory file handle. ++ * ++ * Note : the unsigned long request submitted by the application ++ * is truncated to 32 bits. ++ */ ++ int (*ioctl) (const char *, unsigned int cmd, void *arg, ++ struct fuse_file_info *, unsigned int flags, void *data); ++ ++ /** ++ * Poll for IO readiness events ++ * ++ * Note: If ph is non-NULL, the client should notify ++ * when IO readiness events occur by calling ++ * fuse_notify_poll() with the specified ph. ++ * ++ * Regardless of the number of times poll with a non-NULL ph ++ * is received, single notification is enough to clear all. ++ * Notifying more times incurs overhead but doesn't harm ++ * correctness. ++ * ++ * The callee is responsible for destroying ph with ++ * fuse_pollhandle_destroy() when no longer in use. ++ */ ++ int (*poll) (const char *, struct fuse_file_info *, ++ struct fuse_pollhandle *ph, unsigned *reventsp); ++ ++ /** Write contents of buffer to an open file ++ * ++ * Similar to the write() method, but data is supplied in a ++ * generic buffer. Use fuse_buf_copy() to transfer data to ++ * the destination. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*write_buf) (const char *, struct fuse_bufvec *buf, off_t off, ++ struct fuse_file_info *); ++ ++ /** Store data from an open file in a buffer ++ * ++ * Similar to the read() method, but data is stored and ++ * returned in a generic buffer. ++ * ++ * No actual copying of data has to take place, the source ++ * file descriptor may simply be stored in the buffer for ++ * later data transfer. ++ * ++ * The buffer must be allocated dynamically and stored at the ++ * location pointed to by bufp. 
If the buffer contains memory ++ * regions, they too must be allocated using malloc(). The ++ * allocated memory will be freed by the caller. ++ */ ++ int (*read_buf) (const char *, struct fuse_bufvec **bufp, ++ size_t size, off_t off, struct fuse_file_info *); ++ /** ++ * Perform BSD file locking operation ++ * ++ * The op argument will be either LOCK_SH, LOCK_EX or LOCK_UN ++ * ++ * Nonblocking requests will be indicated by ORing LOCK_NB to ++ * the above operations ++ * ++ * For more information see the flock(2) manual page. ++ * ++ * Additionally fi->owner will be set to a value unique to ++ * this open file. This same value will be supplied to ++ * ->release() when the file is released. ++ * ++ * Note: if this method is not implemented, the kernel will still ++ * allow file locking to work locally. Hence it is only ++ * interesting for network filesystems and similar. ++ */ ++ int (*flock) (const char *, struct fuse_file_info *, int op); ++ ++ /** ++ * Allocates space for an open file ++ * ++ * This function ensures that required space is allocated for specified ++ * file. If this function returns success then any subsequent write ++ * request to specified range is guaranteed not to fail because of lack ++ * of space on the file system media. ++ */ ++ int (*fallocate) (const char *, int, off_t, off_t, ++ struct fuse_file_info *); ++ ++ /** ++ * Copy a range of data from one file to another ++ * ++ * Performs an optimized copy between two file descriptors without the ++ * additional cost of transferring data through the FUSE kernel module ++ * to user space (glibc) and then back into the FUSE filesystem again. ++ * ++ * In case this method is not implemented, glibc falls back to reading ++ * data from the source and writing to the destination. Effectively ++ * doing an inefficient copy of the data. 
++ */ ++ ssize_t (*copy_file_range) (const char *path_in, ++ struct fuse_file_info *fi_in, ++ off_t offset_in, const char *path_out, ++ struct fuse_file_info *fi_out, ++ off_t offset_out, size_t size, int flags); ++ ++ /** ++ * Find next data or hole after the specified offset ++ */ ++ off_t (*lseek) (const char *, off_t off, int whence, struct fuse_file_info *); ++}; ++ ++/** Extra context that may be needed by some filesystems ++ * ++ * The uid, gid and pid fields are not filled in case of a writepage ++ * operation. ++ */ ++struct fuse_context { ++ /** Pointer to the fuse object */ ++ struct fuse *fuse; ++ ++ /** User ID of the calling process */ ++ uid_t uid; ++ ++ /** Group ID of the calling process */ ++ gid_t gid; ++ ++ /** Process ID of the calling thread */ ++ pid_t pid; ++ ++ /** Private filesystem data */ ++ void *private_data; ++ ++ /** Umask of the calling process */ ++ mode_t umask; ++}; ++ ++/** ++ * Main function of FUSE. ++ * ++ * This is for the lazy. This is all that has to be called from the ++ * main() function. ++ * ++ * This function does the following: ++ * - parses command line options, and handles --help and ++ * --version ++ * - installs signal handlers for INT, HUP, TERM and PIPE ++ * - registers an exit handler to unmount the filesystem on program exit ++ * - creates a fuse handle ++ * - registers the operations ++ * - calls either the single-threaded or the multi-threaded event loop ++ * ++ * Most file systems will have to parse some file-system specific ++ * arguments before calling this function. It is recommended to do ++ * this with fuse_opt_parse() and a processing function that passes ++ * through any unknown options (this can also be achieved by just ++ * passing NULL as the processing function). That way, the remaining ++ * options can be passed directly to fuse_main(). ++ * ++ * fuse_main() accepts all options that can be passed to ++ * fuse_parse_cmdline(), fuse_new(), or fuse_session_new(). 
++ * ++ * Option parsing skips argv[0], which is assumed to contain the ++ * program name. This element must always be present and is used to ++ * construct a basic ``usage: `` message for the --help ++ * output. argv[0] may also be set to the empty string. In this case ++ * the usage message is suppressed. This can be used by file systems ++ * to print their own usage line first. See hello.c for an example of ++ * how to do this. ++ * ++ * Note: this is currently implemented as a macro. ++ * ++ * The following error codes may be returned from fuse_main(): ++ * 1: Invalid option arguments ++ * 2: No mount point specified ++ * 3: FUSE setup failed ++ * 4: Mounting failed ++ * 5: Failed to daemonize (detach from session) ++ * 6: Failed to set up signal handlers ++ * 7: An error occured during the life of the file system ++ * ++ * @param argc the argument counter passed to the main() function ++ * @param argv the argument vector passed to the main() function ++ * @param op the file system operation ++ * @param private_data Initial value for the `private_data` ++ * field of `struct fuse_context`. May be overridden by the ++ * `struct fuse_operations.init` handler. ++ * @return 0 on success, nonzero on failure ++ * ++ * Example usage, see hello.c ++ */ ++/* ++ int fuse_main(int argc, char *argv[], const struct fuse_operations *op, ++ void *private_data); ++*/ ++#define fuse_main(argc, argv, op, private_data) \ ++ fuse_main_real(argc, argv, op, sizeof(*(op)), private_data) ++ ++/* ----------------------------------------------------------- * ++ * More detailed API * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Print available options (high- and low-level) to stdout. This is ++ * not an exhaustive list, but includes only those options that may be ++ * of interest to an end-user of a file system. 
++ * ++ * The function looks at the argument vector only to determine if ++ * there are additional modules to be loaded (module=foo option), ++ * and attempts to call their help functions as well. ++ * ++ * @param args the argument vector. ++ */ ++void fuse_lib_help(struct fuse_args *args); ++ ++/** ++ * Create a new FUSE filesystem. ++ * ++ * This function accepts most file-system independent mount options ++ * (like context, nodev, ro - see mount(8)), as well as the ++ * FUSE-specific mount options from mount.fuse(8). ++ * ++ * If the --help option is specified, the function writes a help text ++ * to stdout and returns NULL. ++ * ++ * Option parsing skips argv[0], which is assumed to contain the ++ * program name. This element must always be present and is used to ++ * construct a basic ``usage: `` message for the --help output. If ++ * argv[0] is set to the empty string, no usage message is included in ++ * the --help output. ++ * ++ * If an unknown option is passed in, an error message is written to ++ * stderr and the function returns NULL. ++ * ++ * @param args argument vector ++ * @param op the filesystem operations ++ * @param op_size the size of the fuse_operations structure ++ * @param private_data Initial value for the `private_data` ++ * field of `struct fuse_context`. May be overridden by the ++ * `struct fuse_operations.init` handler. ++ * @return the created FUSE handle ++ */ ++#if FUSE_USE_VERSION == 30 ++struct fuse *fuse_new_30(struct fuse_args *args, const struct fuse_operations *op, ++ size_t op_size, void *private_data); ++#define fuse_new(args, op, size, data) fuse_new_30(args, op, size, data) ++#else ++struct fuse *fuse_new(struct fuse_args *args, const struct fuse_operations *op, ++ size_t op_size, void *private_data); ++#endif ++ ++/** ++ * Mount a FUSE file system. ++ * ++ * @param mountpoint the mount point path ++ * @param f the FUSE handle ++ * ++ * @return 0 on success, -1 on failure. 
++ **/ ++int fuse_mount(struct fuse *f, const char *mountpoint); ++ ++/** ++ * Unmount a FUSE file system. ++ * ++ * See fuse_session_unmount() for additional information. ++ * ++ * @param f the FUSE handle ++ **/ ++void fuse_unmount(struct fuse *f); ++ ++/** ++ * Destroy the FUSE handle. ++ * ++ * NOTE: This function does not unmount the filesystem. If this is ++ * needed, call fuse_unmount() before calling this function. ++ * ++ * @param f the FUSE handle ++ */ ++void fuse_destroy(struct fuse *f); ++ ++/** ++ * FUSE event loop. ++ * ++ * Requests from the kernel are processed, and the appropriate ++ * operations are called. ++ * ++ * For a description of the return value and the conditions when the ++ * event loop exits, refer to the documentation of ++ * fuse_session_loop(). ++ * ++ * @param f the FUSE handle ++ * @return see fuse_session_loop() ++ * ++ * See also: fuse_loop_mt() ++ */ ++int fuse_loop(struct fuse *f); ++ ++/** ++ * Flag session as terminated ++ * ++ * This function will cause any running event loops to exit on ++ * the next opportunity. ++ * ++ * @param f the FUSE handle ++ */ ++void fuse_exit(struct fuse *f); ++ ++/** ++ * FUSE event loop with multiple threads ++ * ++ * Requests from the kernel are processed, and the appropriate ++ * operations are called. Requests are processed in parallel by ++ * distributing them between multiple threads. ++ * ++ * For a description of the return value and the conditions when the ++ * event loop exits, refer to the documentation of ++ * fuse_session_loop(). ++ * ++ * Note: using fuse_loop() instead of fuse_loop_mt() means you are running in ++ * single-threaded mode, and that you will not have to worry about reentrancy, ++ * though you will have to worry about recursive lookups. In single-threaded ++ * mode, FUSE will wait for one callback to return before calling another.
++ * ++ * Enabling multiple threads, by using fuse_loop_mt(), will cause FUSE to make ++ * multiple simultaneous calls into the various callback functions given by your ++ * fuse_operations record. ++ * ++ * If you are using multiple threads, you can enjoy all the parallel execution ++ * and interactive response benefits of threads, and you get to enjoy all the ++ * benefits of race conditions and locking bugs, too. Ensure that any code used ++ * in the callback function of fuse_operations is also thread-safe. ++ * ++ * @param f the FUSE handle ++ * @param config loop configuration ++ * @return see fuse_session_loop() ++ * ++ * See also: fuse_loop() ++ */ ++#if FUSE_USE_VERSION < 32 ++int fuse_loop_mt_31(struct fuse *f, int clone_fd); ++#define fuse_loop_mt(f, clone_fd) fuse_loop_mt_31(f, clone_fd) ++#else ++int fuse_loop_mt(struct fuse *f, struct fuse_loop_config *config); ++#endif ++ ++/** ++ * Get the current context ++ * ++ * The context is only valid for the duration of a filesystem ++ * operation, and thus must not be stored and used later. ++ * ++ * @return the context ++ */ ++struct fuse_context *fuse_get_context(void); ++ ++/** ++ * Get the current supplementary group IDs for the current request ++ * ++ * Similar to the getgroups(2) system call, except the return value is ++ * always the total number of group IDs, even if it is larger than the ++ * specified size. ++ * ++ * The current fuse kernel module in linux (as of 2.6.30) doesn't pass ++ * the group list to userspace, hence this function needs to parse ++ * "/proc/$TID/task/$TID/status" to get the group IDs. ++ * ++ * This feature may not be supported on all operating systems. In ++ * such a case this function will return -ENOSYS. 
++ * ++ * @param size size of given array ++ * @param list array of group IDs to be filled in ++ * @return the total number of supplementary group IDs or -errno on failure ++ */ ++int fuse_getgroups(int size, gid_t list[]); ++ ++/** ++ * Check if the current request has already been interrupted ++ * ++ * @return 1 if the request has been interrupted, 0 otherwise ++ */ ++int fuse_interrupted(void); ++ ++/** ++ * Invalidates cache for the given path. ++ * ++ * This calls fuse_lowlevel_notify_inval_inode internally. ++ * ++ * @return 0 on successful invalidation, negative error value otherwise. ++ * This routine may return -ENOENT to indicate that there was ++ * no entry to be invalidated, e.g., because the path has not ++ * been seen before or has been forgotten; this should not be ++ * considered to be an error. ++ */ ++int fuse_invalidate_path(struct fuse *f, const char *path); ++ ++/** ++ * The real main function ++ * ++ * Do not call this directly, use fuse_main() ++ */ ++int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op, ++ size_t op_size, void *private_data); ++ ++/** ++ * Start the cleanup thread when using option "remember". ++ * ++ * This is done automatically by fuse_loop_mt() ++ * @param fuse struct fuse pointer for fuse instance ++ * @return 0 on success and -1 on error ++ */ ++int fuse_start_cleanup_thread(struct fuse *fuse); ++ ++/** ++ * Stop the cleanup thread when using option "remember". 
++ * ++ * This is done automatically by fuse_loop_mt() ++ * @param fuse struct fuse pointer for fuse instance ++ */ ++void fuse_stop_cleanup_thread(struct fuse *fuse); ++ ++/** ++ * Iterate over cache removing stale entries ++ * use in conjunction with "-oremember" ++ * ++ * NOTE: This is already done for the standard sessions ++ * ++ * @param fuse struct fuse pointer for fuse instance ++ * @return the number of seconds until the next cleanup ++ */ ++int fuse_clean_cache(struct fuse *fuse); ++ ++/* ++ * Stacking API ++ */ ++ ++/** ++ * Fuse filesystem object ++ * ++ * This opaque object represents a filesystem layer ++ */ ++struct fuse_fs; ++ ++/* ++ * These functions call the relevant filesystem operation, and return ++ * the result. ++ * ++ * If the operation is not defined, they return -ENOSYS, with the ++ * exception of fuse_fs_open, fuse_fs_release, fuse_fs_opendir, ++ * fuse_fs_releasedir and fuse_fs_statfs, which return 0. ++ */ ++ ++int fuse_fs_getattr(struct fuse_fs *fs, const char *path, struct stat *buf, ++ struct fuse_file_info *fi); ++int fuse_fs_rename(struct fuse_fs *fs, const char *oldpath, ++ const char *newpath, unsigned int flags); ++int fuse_fs_unlink(struct fuse_fs *fs, const char *path); ++int fuse_fs_rmdir(struct fuse_fs *fs, const char *path); ++int fuse_fs_symlink(struct fuse_fs *fs, const char *linkname, ++ const char *path); ++int fuse_fs_link(struct fuse_fs *fs, const char *oldpath, const char *newpath); ++int fuse_fs_release(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_open(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_read(struct fuse_fs *fs, const char *path, char *buf, size_t size, ++ off_t off, struct fuse_file_info *fi); ++int fuse_fs_read_buf(struct fuse_fs *fs, const char *path, ++ struct fuse_bufvec **bufp, size_t size, off_t off, ++ struct fuse_file_info *fi); ++int fuse_fs_write(struct fuse_fs *fs, const char *path, const char *buf, ++ size_t size,
off_t off, struct fuse_file_info *fi); ++int fuse_fs_write_buf(struct fuse_fs *fs, const char *path, ++ struct fuse_bufvec *buf, off_t off, ++ struct fuse_file_info *fi); ++int fuse_fs_fsync(struct fuse_fs *fs, const char *path, int datasync, ++ struct fuse_file_info *fi); ++int fuse_fs_flush(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_statfs(struct fuse_fs *fs, const char *path, struct statvfs *buf); ++int fuse_fs_opendir(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_readdir(struct fuse_fs *fs, const char *path, void *buf, ++ fuse_fill_dir_t filler, off_t off, ++ struct fuse_file_info *fi, enum fuse_readdir_flags flags); ++int fuse_fs_fsyncdir(struct fuse_fs *fs, const char *path, int datasync, ++ struct fuse_file_info *fi); ++int fuse_fs_releasedir(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_create(struct fuse_fs *fs, const char *path, mode_t mode, ++ struct fuse_file_info *fi); ++int fuse_fs_lock(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi, int cmd, struct flock *lock); ++int fuse_fs_flock(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi, int op); ++int fuse_fs_chmod(struct fuse_fs *fs, const char *path, mode_t mode, ++ struct fuse_file_info *fi); ++int fuse_fs_chown(struct fuse_fs *fs, const char *path, uid_t uid, gid_t gid, ++ struct fuse_file_info *fi); ++int fuse_fs_truncate(struct fuse_fs *fs, const char *path, off_t size, ++ struct fuse_file_info *fi); ++int fuse_fs_utimens(struct fuse_fs *fs, const char *path, ++ const struct timespec tv[2], struct fuse_file_info *fi); ++int fuse_fs_access(struct fuse_fs *fs, const char *path, int mask); ++int fuse_fs_readlink(struct fuse_fs *fs, const char *path, char *buf, ++ size_t len); ++int fuse_fs_mknod(struct fuse_fs *fs, const char *path, mode_t mode, ++ dev_t rdev); ++int fuse_fs_mkdir(struct fuse_fs *fs, const char *path, mode_t mode); ++int 
fuse_fs_setxattr(struct fuse_fs *fs, const char *path, const char *name, ++ const char *value, size_t size, int flags); ++int fuse_fs_getxattr(struct fuse_fs *fs, const char *path, const char *name, ++ char *value, size_t size); ++int fuse_fs_listxattr(struct fuse_fs *fs, const char *path, char *list, ++ size_t size); ++int fuse_fs_removexattr(struct fuse_fs *fs, const char *path, ++ const char *name); ++int fuse_fs_bmap(struct fuse_fs *fs, const char *path, size_t blocksize, ++ uint64_t *idx); ++int fuse_fs_ioctl(struct fuse_fs *fs, const char *path, unsigned int cmd, ++ void *arg, struct fuse_file_info *fi, unsigned int flags, ++ void *data); ++int fuse_fs_poll(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi, struct fuse_pollhandle *ph, ++ unsigned *reventsp); ++int fuse_fs_fallocate(struct fuse_fs *fs, const char *path, int mode, ++ off_t offset, off_t length, struct fuse_file_info *fi); ++ssize_t fuse_fs_copy_file_range(struct fuse_fs *fs, const char *path_in, ++ struct fuse_file_info *fi_in, off_t off_in, ++ const char *path_out, ++ struct fuse_file_info *fi_out, off_t off_out, ++ size_t len, int flags); ++off_t fuse_fs_lseek(struct fuse_fs *fs, const char *path, off_t off, int whence, ++ struct fuse_file_info *fi); ++void fuse_fs_init(struct fuse_fs *fs, struct fuse_conn_info *conn, ++ struct fuse_config *cfg); ++void fuse_fs_destroy(struct fuse_fs *fs); ++ ++int fuse_notify_poll(struct fuse_pollhandle *ph); ++ ++/** ++ * Create a new fuse filesystem object ++ * ++ * This is usually called from the factory of a fuse module to create ++ * a new instance of a filesystem. ++ * ++ * @param op the filesystem operations ++ * @param op_size the size of the fuse_operations structure ++ * @param private_data Initial value for the `private_data` ++ * field of `struct fuse_context`. May be overridden by the ++ * `struct fuse_operations.init` handler. 
++ * @return a new filesystem object ++ */ ++struct fuse_fs *fuse_fs_new(const struct fuse_operations *op, size_t op_size, ++ void *private_data); ++ ++/** ++ * Factory for creating filesystem objects ++ * ++ * The function may use and remove options from 'args' that belong ++ * to this module. ++ * ++ * For now the 'fs' vector always contains exactly one filesystem. ++ * This is the filesystem which will be below the newly created ++ * filesystem in the stack. ++ * ++ * @param args the command line arguments ++ * @param fs NULL terminated filesystem object vector ++ * @return the new filesystem object ++ */ ++typedef struct fuse_fs *(*fuse_module_factory_t)(struct fuse_args *args, ++ struct fuse_fs *fs[]); ++/** ++ * Register filesystem module ++ * ++ * If the "-omodules=*name*_:..." option is present, filesystem ++ * objects are created and pushed onto the stack with the *factory_* ++ * function. ++ * ++ * @param name_ the name of this filesystem module ++ * @param factory_ the factory function for this filesystem module ++ */ ++#define FUSE_REGISTER_MODULE(name_, factory_) \ ++ fuse_module_factory_t fuse_module_ ## name_ ## _factory = factory_ ++ ++/** Get session from fuse object */ ++struct fuse_session *fuse_get_session(struct fuse *f); ++ ++/** ++ * Open a FUSE file descriptor and set up the mount for the given ++ * mountpoint and flags. 
++ * ++ * @param mountpoint reference to the mount in the file system ++ * @param options mount options ++ * @return the FUSE file descriptor or -1 upon error ++ */ ++int fuse_open_channel(const char *mountpoint, const char *options); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* FUSE_H_ */ +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +new file mode 100644 +index 0000000..2d686b2 +--- /dev/null ++++ b/tools/virtiofsd/fuse_common.h +@@ -0,0 +1,823 @@ ++/* FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++/** @file */ ++ ++#if !defined(FUSE_H_) && !defined(FUSE_LOWLEVEL_H_) ++#error "Never include <fuse_common.h> directly; use <fuse.h> or <fuse_lowlevel.h> instead." ++#endif ++ ++#ifndef FUSE_COMMON_H_ ++#define FUSE_COMMON_H_ ++ ++#include "fuse_opt.h" ++#include "fuse_log.h" ++#include <stdint.h> ++#include <sys/types.h> ++ ++/** Major version of FUSE library interface */ ++#define FUSE_MAJOR_VERSION 3 ++ ++/** Minor version of FUSE library interface */ ++#define FUSE_MINOR_VERSION 2 ++ ++#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) ++#define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION) ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/** ++ * Information about an open file. ++ * ++ * File Handles are created by the open, opendir, and create methods and closed ++ * by the release and releasedir methods. Multiple file handles may be ++ * concurrently open for the same file. Generally, a client will create one ++ * file handle per file descriptor, though in some cases multiple file ++ * descriptors can share a single file handle. ++ */ ++struct fuse_file_info { ++ /** Open flags. Available in open() and release() */ ++ int flags; ++ ++ /** In case of a write operation indicates if this was caused ++ by a delayed write from the page cache.
If so, then the ++ context's pid, uid, and gid fields will not be valid, and ++ the *fh* value may not match the *fh* value that would ++ have been sent with the corresponding individual write ++ requests if write caching had been disabled. */ ++ unsigned int writepage : 1; ++ ++ /** Can be filled in by open, to use direct I/O on this file. */ ++ unsigned int direct_io : 1; ++ ++ /** Can be filled in by open. It signals the kernel that any ++ currently cached file data (ie., data that the filesystem ++ provided the last time the file was open) need not be ++ invalidated. Has no effect when set in other contexts (in ++ particular it does nothing when set by opendir()). */ ++ unsigned int keep_cache : 1; ++ ++ /** Indicates a flush operation. Set in flush operation, also ++ maybe set in highlevel lock operation and lowlevel release ++ operation. */ ++ unsigned int flush : 1; ++ ++ /** Can be filled in by open, to indicate that the file is not ++ seekable. */ ++ unsigned int nonseekable : 1; ++ ++ /* Indicates that flock locks for this file should be ++ released. If set, lock_owner shall contain a valid value. ++ May only be set in ->release(). */ ++ unsigned int flock_release : 1; ++ ++ /** Can be filled in by opendir. It signals the kernel to ++ enable caching of entries returned by readdir(). Has no ++ effect when set in other contexts (in particular it does ++ nothing when set by open()). */ ++ unsigned int cache_readdir : 1; ++ ++ /** Padding. Reserved for future use*/ ++ unsigned int padding : 25; ++ unsigned int padding2 : 32; ++ ++ /** File handle id. May be filled in by filesystem in create, ++ * open, and opendir(). Available in most other file operations on the ++ * same file handle. */ ++ uint64_t fh; ++ ++ /** Lock owner id. Available in locking operations and flush */ ++ uint64_t lock_owner; ++ ++ /** Requested poll events. Available in ->poll. Only set on kernels ++ which support it. If unsupported, this field is set to zero. 
*/ ++ uint32_t poll_events; ++}; ++ ++/** ++ * Configuration parameters passed to fuse_session_loop_mt() and ++ * fuse_loop_mt(). ++ */ ++struct fuse_loop_config { ++ /** ++ * whether to use separate device fds for each thread ++ * (may increase performance) ++ */ ++ int clone_fd; ++ ++ /** ++ * The maximum number of available worker threads before they ++ * start to get deleted when they become idle. If not ++ * specified, the default is 10. ++ * ++ * Adjusting this has performance implications; a very small number ++ * of threads in the pool will cause a lot of thread creation and ++ * deletion overhead and performance may suffer. When set to 0, a new ++ * thread will be created to service every operation. ++ */ ++ unsigned int max_idle_threads; ++}; ++ ++/************************************************************************** ++ * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' * ++ **************************************************************************/ ++ ++/** ++ * Indicates that the filesystem supports asynchronous read requests. ++ * ++ * If this capability is not requested/available, the kernel will ++ * ensure that there is at most one pending read request per ++ * file-handle at any time, and will attempt to order read requests by ++ * increasing offset. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_ASYNC_READ (1 << 0) ++ ++/** ++ * Indicates that the filesystem supports "remote" locking. ++ * ++ * This feature is enabled by default when supported by the kernel, ++ * and if getlk() and setlk() handlers are implemented. ++ */ ++#define FUSE_CAP_POSIX_LOCKS (1 << 1) ++ ++/** ++ * Indicates that the filesystem supports the O_TRUNC open flag. If ++ * disabled, and an application specifies O_TRUNC, fuse first calls ++ * truncate() and then open() with O_TRUNC filtered out. ++ * ++ * This feature is enabled by default when supported by the kernel. 
++ */ ++#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3) ++ ++/** ++ * Indicates that the filesystem supports lookups of "." and "..". ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_EXPORT_SUPPORT (1 << 4) ++ ++/** ++ * Indicates that the kernel should not apply the umask to the ++ * file mode on create operations. ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_DONT_MASK (1 << 6) ++ ++/** ++ * Indicates that libfuse should try to use splice() when writing to ++ * the fuse device. This may improve performance. ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_SPLICE_WRITE (1 << 7) ++ ++/** ++ * Indicates that libfuse should try to move pages instead of copying when ++ * writing to / reading from the fuse device. This may improve performance. ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_SPLICE_MOVE (1 << 8) ++ ++/** ++ * Indicates that libfuse should try to use splice() when reading from ++ * the fuse device. This may improve performance. ++ * ++ * This feature is enabled by default when supported by the kernel and ++ * if the filesystem implements a write_buf() handler. ++ */ ++#define FUSE_CAP_SPLICE_READ (1 << 9) ++ ++/** ++ * If set, the calls to flock(2) will be emulated using POSIX locks and must ++ * then be handled by the filesystem's setlock() handler. ++ * ++ * If not set, flock(2) calls will be handled by the FUSE kernel module ++ * internally (so any access that does not go through the kernel cannot be taken ++ * into account). ++ * ++ * This feature is enabled by default when supported by the kernel and ++ * if the filesystem implements a flock() handler. ++ */ ++#define FUSE_CAP_FLOCK_LOCKS (1 << 10) ++ ++/** ++ * Indicates that the filesystem supports ioctl's on directories. ++ * ++ * This feature is enabled by default when supported by the kernel. 
++ */ ++#define FUSE_CAP_IOCTL_DIR (1 << 11) ++ ++/** ++ * Traditionally, while a file is open the FUSE kernel module only ++ * asks the filesystem for an update of the file's attributes when a ++ * client attempts to read beyond EOF. This is unsuitable for ++ * e.g. network filesystems, where the file contents may change ++ * without the kernel knowing about it. ++ * ++ * If this flag is set, FUSE will check the validity of the attributes ++ * on every read. If the attributes are no longer valid (i.e., if the ++ * *attr_timeout* passed to fuse_reply_attr() or set in `struct ++ * fuse_entry_param` has passed), it will first issue a `getattr` ++ * request. If the new mtime differs from the previous value, any ++ * cached file *contents* will be invalidated as well. ++ * ++ * This flag should always be set when available. If all file changes ++ * go through the kernel, *attr_timeout* should be set to a very large ++ * number to avoid unnecessary getattr() calls. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12) ++ ++/** ++ * Indicates that the filesystem supports readdirplus. ++ * ++ * This feature is enabled by default when supported by the kernel and if the ++ * filesystem implements a readdirplus() handler. ++ */ ++#define FUSE_CAP_READDIRPLUS (1 << 13) ++ ++/** ++ * Indicates that the filesystem supports adaptive readdirplus. ++ * ++ * If FUSE_CAP_READDIRPLUS is not set, this flag has no effect. ++ * ++ * If FUSE_CAP_READDIRPLUS is set and this flag is not set, the kernel ++ * will always issue readdirplus() requests to retrieve directory ++ * contents. ++ * ++ * If FUSE_CAP_READDIRPLUS is set and this flag is set, the kernel ++ * will issue both readdir() and readdirplus() requests, depending on ++ * how much information is expected to be required. 
++ * ++ * As of Linux 4.20, the algorithm is as follows: when userspace ++ * starts to read directory entries, issue a READDIRPLUS request to ++ * the filesystem. If any entry attributes have been looked up by the ++ * time userspace requests the next batch of entries continue with ++ * READDIRPLUS, otherwise switch to plain READDIR. This will result ++ * in e.g. plain "ls" triggering READDIRPLUS first then READDIR after ++ * that because it doesn't do lookups. "ls -l" should result in all ++ * READDIRPLUS, except if dentries are already cached. ++ * ++ * This feature is enabled by default when supported by the kernel and ++ * if the filesystem implements both a readdirplus() and a readdir() ++ * handler. ++ */ ++#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14) ++ ++/** ++ * Indicates that the filesystem supports asynchronous direct I/O submission. ++ * ++ * If this capability is not requested/available, the kernel will ensure that ++ * there is at most one pending read and one pending write request per direct ++ * I/O file-handle at any time. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_ASYNC_DIO (1 << 15) ++ ++/** ++ * Indicates that writeback caching should be enabled. This means that ++ * individual write requests may be buffered and merged in the kernel ++ * before they are sent to the filesystem. ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_WRITEBACK_CACHE (1 << 16) ++ ++/** ++ * Indicates support for zero-message opens. If this flag is set in ++ * the `capable` field of the `fuse_conn_info` structure, then the ++ * filesystem may return `ENOSYS` from the open() handler to indicate ++ * success. Further attempts to open files will be handled in the ++ * kernel. (If this flag is not set, returning ENOSYS will be treated ++ * as an error and signaled to the caller). ++ * ++ * Setting (or unsetting) this flag in the `want` field has *no ++ * effect*.
++ */ ++#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17) ++ ++/** ++ * Indicates support for parallel directory operations. If this flag ++ * is unset, the FUSE kernel module will ensure that lookup() and ++ * readdir() requests are never issued concurrently for the same ++ * directory. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_PARALLEL_DIROPS (1 << 18) ++ ++/** ++ * Indicates support for POSIX ACLs. ++ * ++ * If this feature is enabled, the kernel will cache and have ++ * responsibility for enforcing ACLs. ACL will be stored as xattrs and ++ * passed to userspace, which is responsible for updating the ACLs in ++ * the filesystem, keeping the file mode in sync with the ACL, and ++ * ensuring inheritance of default ACLs when new filesystem nodes are ++ * created. Note that this requires that the file system is able to ++ * parse and interpret the xattr representation of ACLs. ++ * ++ * Enabling this feature implicitly turns on the ++ * ``default_permissions`` mount option (even if it was not passed to ++ * mount(2)). ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_POSIX_ACL (1 << 19) ++ ++/** ++ * Indicates that the filesystem is responsible for unsetting ++ * setuid and setgid bits when a file is written, truncated, or ++ * its owner is changed. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20) ++ ++/** ++ * Indicates support for zero-message opendirs. If this flag is set in ++ * the `capable` field of the `fuse_conn_info` structure, then the filesystem ++ * may return `ENOSYS` from the opendir() handler to indicate success. Further ++ * opendir and releasedir messages will be handled in the kernel. (If this ++ * flag is not set, returning ENOSYS will be treated as an error and signalled ++ * to the caller.) ++ * ++ * Setting (or unsetting) this flag in the `want` field has *no effect*. 
++ */ ++#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24) ++ ++/** ++ * Ioctl flags ++ * ++ * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine ++ * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed ++ * FUSE_IOCTL_RETRY: retry with new iovecs ++ * FUSE_IOCTL_DIR: is a directory ++ * ++ * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs ++ */ ++#define FUSE_IOCTL_COMPAT (1 << 0) ++#define FUSE_IOCTL_UNRESTRICTED (1 << 1) ++#define FUSE_IOCTL_RETRY (1 << 2) ++#define FUSE_IOCTL_DIR (1 << 4) ++ ++#define FUSE_IOCTL_MAX_IOV 256 ++ ++/** ++ * Connection information, passed to the ->init() method ++ * ++ * Some of the elements are read-write, these can be changed to ++ * indicate the value requested by the filesystem. The requested ++ * value must usually be smaller than the indicated value. ++ */ ++struct fuse_conn_info { ++ /** ++ * Major version of the protocol (read-only) ++ */ ++ unsigned proto_major; ++ ++ /** ++ * Minor version of the protocol (read-only) ++ */ ++ unsigned proto_minor; ++ ++ /** ++ * Maximum size of the write buffer ++ */ ++ unsigned max_write; ++ ++ /** ++ * Maximum size of read requests. A value of zero indicates no ++ * limit. However, even if the filesystem does not specify a ++ * limit, the maximum size of read requests will still be ++ * limited by the kernel. ++ * ++ * NOTE: For the time being, the maximum size of read requests ++ * must be set both here *and* passed to fuse_session_new() ++ * using the ``-o max_read=N`` mount option. At some point ++ * in the future, specifying the mount option will no longer ++ * be necessary. ++ */ ++ unsigned max_read; ++ ++ /** ++ * Maximum readahead ++ */ ++ unsigned max_readahead; ++ ++ /** ++ * Capability flags that the kernel supports (read-only) ++ */ ++ unsigned capable; ++ ++ /** ++ * Capability flags that the filesystem wants to enable. ++ * ++ * libfuse attempts to initialize this field with ++ * reasonable default values before calling the init() handler.
++ */ ++ unsigned want; ++ ++ /** ++ * Maximum number of pending "background" requests. A ++ * background request is any type of request for which the ++ * total number is not limited by other means. As of kernel ++ * 4.8, only two types of requests fall into this category: ++ * ++ * 1. Read-ahead requests ++ * 2. Asynchronous direct I/O requests ++ * ++ * Read-ahead requests are generated (if max_readahead is ++ * non-zero) by the kernel to preemptively fill its caches ++ * when it anticipates that userspace will soon read more ++ * data. ++ * ++ * Asynchronous direct I/O requests are generated if ++ * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large ++ * direct I/O request. In this case the kernel will internally ++ * split it up into multiple smaller requests and submit them ++ * to the filesystem concurrently. ++ * ++ * Note that the following requests are *not* background ++ * requests: writeback requests (limited by the kernel's ++ * flusher algorithm), regular (i.e., synchronous and ++ * buffered) userspace read/write requests (limited to one per ++ * thread), asynchronous read requests (Linux's io_submit(2) ++ * call actually blocks, so these are also limited to one per ++ * thread). ++ */ ++ unsigned max_background; ++ ++ /** ++ * Kernel congestion threshold parameter. If the number of pending ++ * background requests exceeds this number, the FUSE kernel module will ++ * mark the filesystem as "congested". This instructs the kernel to ++ * expect that queued requests will take some time to complete, and to ++ * adjust its algorithms accordingly (e.g. by putting a waiting thread ++ * to sleep instead of using a busy-loop). ++ */ ++ unsigned congestion_threshold; ++ ++ /** ++ * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible ++ * for updating mtime and ctime when write requests are received. The ++ * updated values are passed to the filesystem with setattr() requests. 
++ * However, if the filesystem does not support the full resolution of ++ * the kernel timestamps (nanoseconds), the mtime and ctime values used ++ * by kernel and filesystem will differ (and result in an apparent ++ * change of times after a cache flush). ++ * ++ * To prevent this problem, this variable can be used to inform the ++ * kernel about the timestamp granularity supported by the file-system. ++ * The value should be power of 10. The default is 1, i.e. full ++ * nano-second resolution. Filesystems supporting only second resolution ++ * should set this to 1000000000. ++ */ ++ unsigned time_gran; ++ ++ /** ++ * For future use. ++ */ ++ unsigned reserved[22]; ++}; ++ ++struct fuse_session; ++struct fuse_pollhandle; ++struct fuse_conn_info_opts; ++ ++/** ++ * This function parses several command-line options that can be used ++ * to override elements of struct fuse_conn_info. The pointer returned ++ * by this function should be passed to the ++ * fuse_apply_conn_info_opts() method by the file system's init() ++ * handler. ++ * ++ * Before using this function, think twice if you really want these ++ * parameters to be adjustable from the command line. In most cases, ++ * they should be determined by the file system internally. 
++ * ++ * The following options are recognized: ++ * ++ * -o max_write=N sets conn->max_write ++ * -o max_readahead=N sets conn->max_readahead ++ * -o max_background=N sets conn->max_background ++ * -o congestion_threshold=N sets conn->congestion_threshold ++ * -o async_read sets FUSE_CAP_ASYNC_READ in conn->want ++ * -o sync_read unsets FUSE_CAP_ASYNC_READ in conn->want ++ * -o atomic_o_trunc sets FUSE_CAP_ATOMIC_O_TRUNC in conn->want ++ * -o no_remote_lock Equivalent to -o no_remote_flock,no_remote_posix_lock ++ * -o no_remote_flock Unsets FUSE_CAP_FLOCK_LOCKS in conn->want ++ * -o no_remote_posix_lock Unsets FUSE_CAP_POSIX_LOCKS in conn->want ++ * -o [no_]splice_write (un-)sets FUSE_CAP_SPLICE_WRITE in conn->want ++ * -o [no_]splice_move (un-)sets FUSE_CAP_SPLICE_MOVE in conn->want ++ * -o [no_]splice_read (un-)sets FUSE_CAP_SPLICE_READ in conn->want ++ * -o [no_]auto_inval_data (un-)sets FUSE_CAP_AUTO_INVAL_DATA in conn->want ++ * -o readdirplus=no unsets FUSE_CAP_READDIRPLUS in conn->want ++ * -o readdirplus=yes sets FUSE_CAP_READDIRPLUS and unsets ++ * FUSE_CAP_READDIRPLUS_AUTO in conn->want ++ * -o readdirplus=auto sets FUSE_CAP_READDIRPLUS and ++ * FUSE_CAP_READDIRPLUS_AUTO in conn->want ++ * -o [no_]async_dio (un-)sets FUSE_CAP_ASYNC_DIO in conn->want ++ * -o [no_]writeback_cache (un-)sets FUSE_CAP_WRITEBACK_CACHE in conn->want ++ * -o time_gran=N sets conn->time_gran ++ * ++ * Known options will be removed from *args*, unknown options will be ++ * passed through unchanged. ++ * ++ * @param args argument vector (input+output) ++ * @return parsed options ++ **/ ++struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args); ++ ++/** ++ * This function applies the (parsed) parameters in *opts* to the ++ * *conn* pointer. It may modify the following fields: want, ++ * max_write, max_readahead, congestion_threshold, max_background, ++ * time_gran. A field is only set (or unset) if the corresponding ++ * option has been explicitly set.
++ */ ++void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, ++ struct fuse_conn_info *conn); ++ ++/** ++ * Go into the background ++ * ++ * @param foreground if true, stay in the foreground ++ * @return 0 on success, -1 on failure ++ */ ++int fuse_daemonize(int foreground); ++ ++/** ++ * Get the version of the library ++ * ++ * @return the version ++ */ ++int fuse_version(void); ++ ++/** ++ * Get the full package version string of the library ++ * ++ * @return the package version ++ */ ++const char *fuse_pkgversion(void); ++ ++/** ++ * Destroy poll handle ++ * ++ * @param ph the poll handle ++ */ ++void fuse_pollhandle_destroy(struct fuse_pollhandle *ph); ++ ++/* ----------------------------------------------------------- * ++ * Data buffer * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Buffer flags ++ */ ++enum fuse_buf_flags { ++ /** ++ * Buffer contains a file descriptor ++ * ++ * If this flag is set, the .fd field is valid, otherwise the ++ * .mem field is valid. ++ */ ++ FUSE_BUF_IS_FD = (1 << 1), ++ ++ /** ++ * Seek on the file descriptor ++ * ++ * If this flag is set then the .pos field is valid and is ++ * used to seek to the given offset before performing ++ * operation on file descriptor. ++ */ ++ FUSE_BUF_FD_SEEK = (1 << 2), ++ ++ /** ++ * Retry operation on file descriptor ++ * ++ * If this flag is set then retry operation on file descriptor ++ * until .size bytes have been copied or an error or EOF is ++ * detected. ++ */ ++ FUSE_BUF_FD_RETRY = (1 << 3), ++}; ++ ++/** ++ * Buffer copy flags ++ */ ++enum fuse_buf_copy_flags { ++ /** ++ * Don't use splice(2) ++ * ++ * Always fall back to using read and write instead of ++ * splice(2) to copy data from one file descriptor to another. ++ * ++ * If this flag is not set, then only fall back if splice is ++ * unavailable.
++ */ ++ FUSE_BUF_NO_SPLICE = (1 << 1), ++ ++ /** ++ * Force splice ++ * ++ * Always use splice(2) to copy data from one file descriptor ++ * to another. If splice is not available, return -EINVAL. ++ */ ++ FUSE_BUF_FORCE_SPLICE = (1 << 2), ++ ++ /** ++ * Try to move data with splice. ++ * ++ * If splice is used, try to move pages from the source to the ++ * destination instead of copying. See documentation of ++ * SPLICE_F_MOVE in splice(2) man page. ++ */ ++ FUSE_BUF_SPLICE_MOVE = (1 << 3), ++ ++ /** ++ * Don't block on the pipe when copying data with splice ++ * ++ * Makes the operations on the pipe non-blocking (if the pipe ++ * is full or empty). See SPLICE_F_NONBLOCK in the splice(2) ++ * man page. ++ */ ++ FUSE_BUF_SPLICE_NONBLOCK= (1 << 4), ++}; ++ ++/** ++ * Single data buffer ++ * ++ * Generic data buffer for I/O, extended attributes, etc... Data may ++ * be supplied as a memory pointer or as a file descriptor ++ */ ++struct fuse_buf { ++ /** ++ * Size of data in bytes ++ */ ++ size_t size; ++ ++ /** ++ * Buffer flags ++ */ ++ enum fuse_buf_flags flags; ++ ++ /** ++ * Memory pointer ++ * ++ * Used unless FUSE_BUF_IS_FD flag is set. ++ */ ++ void *mem; ++ ++ /** ++ * File descriptor ++ * ++ * Used if FUSE_BUF_IS_FD flag is set. ++ */ ++ int fd; ++ ++ /** ++ * File position ++ * ++ * Used if FUSE_BUF_FD_SEEK flag is set. ++ */ ++ off_t pos; ++}; ++ ++/** ++ * Data buffer vector ++ * ++ * An array of data buffers, each containing a memory pointer or a ++ * file descriptor. ++ * ++ * Allocate dynamically to add more than one buffer. 
++ */ ++struct fuse_bufvec { ++ /** ++ * Number of buffers in the array ++ */ ++ size_t count; ++ ++ /** ++ * Index of current buffer within the array ++ */ ++ size_t idx; ++ ++ /** ++ * Current offset within the current buffer ++ */ ++ size_t off; ++ ++ /** ++ * Array of buffers ++ */ ++ struct fuse_buf buf[1]; ++}; ++ ++/* Initialize bufvec with a single buffer of given size */ ++#define FUSE_BUFVEC_INIT(size__) \ ++ ((struct fuse_bufvec) { \ ++ /* .count= */ 1, \ ++ /* .idx = */ 0, \ ++ /* .off = */ 0, \ ++ /* .buf = */ { /* [0] = */ { \ ++ /* .size = */ (size__), \ ++ /* .flags = */ (enum fuse_buf_flags) 0, \ ++ /* .mem = */ NULL, \ ++ /* .fd = */ -1, \ ++ /* .pos = */ 0, \ ++ } } \ ++ } ) ++ ++/** ++ * Get total size of data in a fuse buffer vector ++ * ++ * @param bufv buffer vector ++ * @return size of data ++ */ ++size_t fuse_buf_size(const struct fuse_bufvec *bufv); ++ ++/** ++ * Copy data from one buffer vector to another ++ * ++ * @param dst destination buffer vector ++ * @param src source buffer vector ++ * @param flags flags controlling the copy ++ * @return actual number of bytes copied or -errno on error ++ */ ++ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src, ++ enum fuse_buf_copy_flags flags); ++ ++/* ----------------------------------------------------------- * ++ * Signal handling * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Exit session on HUP, TERM and INT signals and ignore PIPE signal ++ * ++ * Stores session in a global variable. May only be called once per ++ * process until fuse_remove_signal_handlers() is called. ++ * ++ * Once either of the POSIX signals arrives, the signal handler calls ++ * fuse_session_exit(). 
++ * ++ * @param se the session to exit ++ * @return 0 on success, -1 on failure ++ * ++ * See also: ++ * fuse_remove_signal_handlers() ++ */ ++int fuse_set_signal_handlers(struct fuse_session *se); ++ ++/** ++ * Restore default signal handlers ++ * ++ * Resets global session. After this fuse_set_signal_handlers() may ++ * be called again. ++ * ++ * @param se the same session as given in fuse_set_signal_handlers() ++ * ++ * See also: ++ * fuse_set_signal_handlers() ++ */ ++void fuse_remove_signal_handlers(struct fuse_session *se); ++ ++/* ----------------------------------------------------------- * ++ * Compatibility stuff * ++ * ----------------------------------------------------------- */ ++ ++#if !defined(FUSE_USE_VERSION) || FUSE_USE_VERSION < 30 ++# error only API version 30 or greater is supported ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++ ++/* ++ * This interface uses 64 bit off_t. ++ * ++ * On 32bit systems please add -D_FILE_OFFSET_BITS=64 to your compile flags! ++ */ ++ ++#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && !defined __cplusplus ++_Static_assert(sizeof(off_t) == 8, "fuse: off_t must be 64bit"); ++#else ++struct _fuse_off_t_must_be_64bit_dummy_struct \ ++ { unsigned _fuse_off_t_must_be_64bit:((sizeof(off_t) == 8) ? 1 : -1); }; ++#endif ++ ++#endif /* FUSE_COMMON_H_ */ +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +new file mode 100644 +index 0000000..d38b630 +--- /dev/null ++++ b/tools/virtiofsd/fuse_i.h +@@ -0,0 +1,139 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. 
++ See the file COPYING.LIB ++*/ ++ ++#include "fuse.h" ++#include "fuse_lowlevel.h" ++ ++struct mount_opts; ++ ++struct fuse_req { ++ struct fuse_session *se; ++ uint64_t unique; ++ int ctr; ++ pthread_mutex_t lock; ++ struct fuse_ctx ctx; ++ struct fuse_chan *ch; ++ int interrupted; ++ unsigned int ioctl_64bit : 1; ++ union { ++ struct { ++ uint64_t unique; ++ } i; ++ struct { ++ fuse_interrupt_func_t func; ++ void *data; ++ } ni; ++ } u; ++ struct fuse_req *next; ++ struct fuse_req *prev; ++}; ++ ++struct fuse_notify_req { ++ uint64_t unique; ++ void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t, ++ const void *, const struct fuse_buf *); ++ struct fuse_notify_req *next; ++ struct fuse_notify_req *prev; ++}; ++ ++struct fuse_session { ++ char *mountpoint; ++ volatile int exited; ++ int fd; ++ struct mount_opts *mo; ++ int debug; ++ int deny_others; ++ struct fuse_lowlevel_ops op; ++ int got_init; ++ struct cuse_data *cuse_data; ++ void *userdata; ++ uid_t owner; ++ struct fuse_conn_info conn; ++ struct fuse_req list; ++ struct fuse_req interrupts; ++ pthread_mutex_t lock; ++ int got_destroy; ++ pthread_key_t pipe_key; ++ int broken_splice_nonblock; ++ uint64_t notify_ctr; ++ struct fuse_notify_req notify_list; ++ size_t bufsize; ++ int error; ++}; ++ ++struct fuse_chan { ++ pthread_mutex_t lock; ++ int ctr; ++ int fd; ++}; ++ ++/** ++ * Filesystem module ++ * ++ * Filesystem modules are registered with the FUSE_REGISTER_MODULE() ++ * macro. 
++ * ++ */ ++struct fuse_module { ++ char *name; ++ fuse_module_factory_t factory; ++ struct fuse_module *next; ++ struct fusemod_so *so; ++ int ctr; ++}; ++ ++/* ----------------------------------------------------------- * ++ * Channel interface (when using -o clone_fd) * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Obtain counted reference to the channel ++ * ++ * @param ch the channel ++ * @return the channel ++ */ ++struct fuse_chan *fuse_chan_get(struct fuse_chan *ch); ++ ++/** ++ * Drop counted reference to a channel ++ * ++ * @param ch the channel ++ */ ++void fuse_chan_put(struct fuse_chan *ch); ++ ++struct mount_opts *parse_mount_opts(struct fuse_args *args); ++void destroy_mount_opts(struct mount_opts *mo); ++void fuse_mount_version(void); ++unsigned get_max_read(struct mount_opts *o); ++void fuse_kern_unmount(const char *mountpoint, int fd); ++int fuse_kern_mount(const char *mountpoint, struct mount_opts *mo); ++ ++int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, ++ int count); ++void fuse_free_req(fuse_req_t req); ++ ++void cuse_lowlevel_init(fuse_req_t req, fuse_ino_t nodeide, const void *inarg); ++ ++int fuse_start_thread(pthread_t *thread_id, void *(*func)(void *), void *arg); ++ ++int fuse_session_receive_buf_int(struct fuse_session *se, struct fuse_buf *buf, ++ struct fuse_chan *ch); ++void fuse_session_process_buf_int(struct fuse_session *se, ++ const struct fuse_buf *buf, struct fuse_chan *ch); ++ ++struct fuse *fuse_new_31(struct fuse_args *args, const struct fuse_operations *op, ++ size_t op_size, void *private_data); ++int fuse_loop_mt_32(struct fuse *f, struct fuse_loop_config *config); ++int fuse_session_loop_mt_32(struct fuse_session *se, struct fuse_loop_config *config); ++ ++#define FUSE_MAX_MAX_PAGES 256 ++#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 ++ ++/* room needed in buffer to accommodate header */ ++#define FUSE_BUFFER_HEADER_SIZE 0x1000 ++ +diff --git 
a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h +new file mode 100644 +index 0000000..5e112e0 +--- /dev/null ++++ b/tools/virtiofsd/fuse_log.h +@@ -0,0 +1,82 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2019 Red Hat, Inc. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++#ifndef FUSE_LOG_H_ ++#define FUSE_LOG_H_ ++ ++/** @file ++ * ++ * This file defines the logging interface of FUSE ++ */ ++ ++#include <stdarg.h> ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/** ++ * Log severity level ++ * ++ * These levels correspond to syslog(2) log levels since they are widely used. ++ */ ++enum fuse_log_level { ++ FUSE_LOG_EMERG, ++ FUSE_LOG_ALERT, ++ FUSE_LOG_CRIT, ++ FUSE_LOG_ERR, ++ FUSE_LOG_WARNING, ++ FUSE_LOG_NOTICE, ++ FUSE_LOG_INFO, ++ FUSE_LOG_DEBUG ++}; ++ ++/** ++ * Log message handler function. ++ * ++ * This function must be thread-safe. It may be called from any libfuse ++ * function, including fuse_parse_cmdline() and other functions invoked before ++ * a FUSE filesystem is created. ++ * ++ * Install a custom log message handler function using fuse_set_log_func(). ++ * ++ * @param level log severity level ++ * @param fmt sprintf-style format string including newline ++ * @param ap format string arguments ++ */ ++typedef void (*fuse_log_func_t)(enum fuse_log_level level, ++ const char *fmt, va_list ap); ++ ++/** ++ * Install a custom log handler function. ++ * ++ * Log messages are emitted by libfuse functions to report errors and debug ++ * information. Messages are printed to stderr by default but this can be ++ * overridden by installing a custom log message handler function. ++ * ++ * The log message handler function is global and affects all FUSE filesystems ++ * created within this process.
++ * ++ * @param func a custom log message handler function or NULL to revert to ++ * the default ++ */ ++void fuse_set_log_func(fuse_log_func_t func); ++ ++/** ++ * Emit a log message ++ * ++ * @param level severity level (FUSE_LOG_ERR, FUSE_LOG_DEBUG, etc) ++ * @param fmt sprintf-style format string including newline ++ */ ++void fuse_log(enum fuse_log_level level, const char *fmt, ...); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* FUSE_LOG_H_ */ +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +new file mode 100644 +index 0000000..18c6363 +--- /dev/null ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -0,0 +1,2089 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++#ifndef FUSE_LOWLEVEL_H_ ++#define FUSE_LOWLEVEL_H_ ++ ++/** @file ++ * ++ * Low level API ++ * ++ * IMPORTANT: you should define FUSE_USE_VERSION before including this ++ * header. To use the newest API define it to 31 (recommended for any ++ * new application). 
++ */ ++ ++#ifndef FUSE_USE_VERSION ++#error FUSE_USE_VERSION not defined ++#endif ++ ++#include "fuse_common.h" ++ ++#include <utime.h> ++#include <fcntl.h> ++#include <sys/types.h> ++#include <sys/stat.h> ++#include <sys/statvfs.h> ++#include <sys/uio.h> ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ----------------------------------------------------------- * ++ * Miscellaneous definitions * ++ * ----------------------------------------------------------- */ ++ ++/** The node ID of the root inode */ ++#define FUSE_ROOT_ID 1 ++ ++/** Inode number type */ ++typedef uint64_t fuse_ino_t; ++ ++/** Request pointer type */ ++typedef struct fuse_req *fuse_req_t; ++ ++/** ++ * Session ++ * ++ * This provides hooks for processing requests, and exiting ++ */ ++struct fuse_session; ++ ++/** Directory entry parameters supplied to fuse_reply_entry() */ ++struct fuse_entry_param { ++ /** Unique inode number ++ * ++ * In lookup, zero means negative entry (from version 2.5) ++ * Returning ENOENT also means negative entry, but by setting zero ++ * ino the kernel may cache negative entries for entry_timeout ++ * seconds. ++ */ ++ fuse_ino_t ino; ++ ++ /** Generation number for this entry. ++ * ++ * If the file system will be exported over NFS, the ++ * ino/generation pairs need to be unique over the file ++ * system's lifetime (rather than just the mount time). So if ++ * the file system reuses an inode after it has been deleted, ++ * it must assign a new, previously unused generation number ++ * to the inode at the same time. ++ * ++ */ ++ uint64_t generation; ++ ++ /** Inode attributes. ++ * ++ * Even if attr_timeout == 0, attr must be correct. For example, ++ * for open(), FUSE uses attr.st_size from lookup() to determine ++ * how many bytes to request. If this value is not correct, ++ * incorrect data will be returned. ++ */ ++ struct stat attr; ++ ++ /** Validity timeout (in seconds) for inode attributes. If ++ attributes only change as a result of requests that come ++ through the kernel, this should be set to a very large ++ value.
*/ ++ double attr_timeout; ++ ++ /** Validity timeout (in seconds) for the name. If directory ++ entries are changed/deleted only as a result of requests ++ that come through the kernel, this should be set to a very ++ large value. */ ++ double entry_timeout; ++}; ++ ++/** ++ * Additional context associated with requests. ++ * ++ * Note that the reported client uid, gid and pid may be zero in some ++ * situations. For example, if the FUSE file system is running in a ++ * PID or user namespace but then accessed from outside the namespace, ++ * there is no valid uid/pid/gid that could be reported. ++ */ ++struct fuse_ctx { ++ /** User ID of the calling process */ ++ uid_t uid; ++ ++ /** Group ID of the calling process */ ++ gid_t gid; ++ ++ /** Thread ID of the calling process */ ++ pid_t pid; ++ ++ /** Umask of the calling process */ ++ mode_t umask; ++}; ++ ++struct fuse_forget_data { ++ fuse_ino_t ino; ++ uint64_t nlookup; ++}; ++ ++/* 'to_set' flags in setattr */ ++#define FUSE_SET_ATTR_MODE (1 << 0) ++#define FUSE_SET_ATTR_UID (1 << 1) ++#define FUSE_SET_ATTR_GID (1 << 2) ++#define FUSE_SET_ATTR_SIZE (1 << 3) ++#define FUSE_SET_ATTR_ATIME (1 << 4) ++#define FUSE_SET_ATTR_MTIME (1 << 5) ++#define FUSE_SET_ATTR_ATIME_NOW (1 << 7) ++#define FUSE_SET_ATTR_MTIME_NOW (1 << 8) ++#define FUSE_SET_ATTR_CTIME (1 << 10) ++ ++/* ----------------------------------------------------------- * ++ * Request methods and replies * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Low level filesystem operations ++ * ++ * Most of the methods (with the exception of init and destroy) ++ * receive a request handle (fuse_req_t) as their first argument. ++ * This handle must be passed to one of the specified reply functions. ++ * ++ * This may be done inside the method invocation, or after the call ++ * has returned. The request handle is valid until one of the reply ++ * functions is called. 
++ * ++ * Other pointer arguments (name, fuse_file_info, etc) are not valid ++ * after the call has returned, so if they are needed later, their ++ * contents have to be copied. ++ * ++ * In general, all methods are expected to perform any necessary ++ * permission checking. However, a filesystem may delegate this task ++ * to the kernel by passing the `default_permissions` mount option to ++ * `fuse_session_new()`. In this case, methods will only be called if ++ * the kernel's permission check has succeeded. ++ * ++ * The filesystem sometimes needs to handle a return value of -ENOENT ++ * from the reply function, which means, that the request was ++ * interrupted, and the reply discarded. For example if ++ * fuse_reply_open() return -ENOENT means, that the release method for ++ * this file will not be called. ++ */ ++struct fuse_lowlevel_ops { ++ /** ++ * Initialize filesystem ++ * ++ * This function is called when libfuse establishes ++ * communication with the FUSE kernel module. The file system ++ * should use this module to inspect and/or modify the ++ * connection parameters provided in the `conn` structure. ++ * ++ * Note that some parameters may be overwritten by options ++ * passed to fuse_session_new() which take precedence over the ++ * values set in this handler. ++ * ++ * There's no reply to this function ++ * ++ * @param userdata the user data passed to fuse_session_new() ++ */ ++ void (*init) (void *userdata, struct fuse_conn_info *conn); ++ ++ /** ++ * Clean up filesystem. ++ * ++ * Called on filesystem exit. When this method is called, the ++ * connection to the kernel may be gone already, so that eg. calls ++ * to fuse_lowlevel_notify_* will fail. ++ * ++ * There's no reply to this function ++ * ++ * @param userdata the user data passed to fuse_session_new() ++ */ ++ void (*destroy) (void *userdata); ++ ++ /** ++ * Look up a directory entry by name and get its attributes. 
++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name the name to look up ++ */ ++ void (*lookup) (fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Forget about an inode ++ * ++ * This function is called when the kernel removes an inode ++ * from its internal caches. ++ * ++ * The inode's lookup count increases by one for every call to ++ * fuse_reply_entry and fuse_reply_create. The nlookup parameter ++ * indicates by how much the lookup count should be decreased. ++ * ++ * Inodes with a non-zero lookup count may receive request from ++ * the kernel even after calls to unlink, rmdir or (when ++ * overwriting an existing file) rename. Filesystems must handle ++ * such requests properly and it is recommended to defer removal ++ * of the inode until the lookup count reaches zero. Calls to ++ * unlink, rmdir or rename will be followed closely by forget ++ * unless the file or directory is open, in which case the ++ * kernel issues forget only after the release or releasedir ++ * calls. ++ * ++ * Note that if a file system will be exported over NFS the ++ * inodes lifetime must extend even beyond forget. See the ++ * generation field in struct fuse_entry_param above. ++ * ++ * On unmount the lookup count for all inodes implicitly drops ++ * to zero. It is not guaranteed that the file system will ++ * receive corresponding forget messages for the affected ++ * inodes. ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param nlookup the number of lookups to forget ++ */ ++ void (*forget) (fuse_req_t req, fuse_ino_t ino, uint64_t nlookup); ++ ++ /** ++ * Get file attributes. 
++ * ++ * If writeback caching is enabled, the kernel may have a ++ * better idea of a file's length than the FUSE file system ++ * (eg if there has been a write that extended the file size, ++ * but that has not yet been passed to the filesystem). ++ * ++ * In this case, the st_size value provided by the file system ++ * will be ignored. ++ * ++ * Valid replies: ++ * fuse_reply_attr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi for future use, currently always NULL ++ */ ++ void (*getattr) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Set file attributes ++ * ++ * In the 'attr' argument only members indicated by the 'to_set' ++ * bitmask contain valid values. Other members contain undefined ++ * values. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits if the file ++ * size or owner is being changed. ++ * ++ * If the setattr was invoked from the ftruncate() system call ++ * under Linux kernel versions 2.6.15 or later, the fi->fh will ++ * contain the value set by the open method or will be undefined ++ * if the open method didn't set any value. Otherwise (not ++ * ftruncate call, or kernel version earlier than 2.6.15) the fi ++ * parameter will be NULL.
++ * ++ * Valid replies: ++ * fuse_reply_attr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param attr the attributes ++ * @param to_set bit mask of attributes which should be set ++ * @param fi file information, or NULL ++ */ ++ void (*setattr) (fuse_req_t req, fuse_ino_t ino, struct stat *attr, ++ int to_set, struct fuse_file_info *fi); ++ ++ /** ++ * Read symbolic link ++ * ++ * Valid replies: ++ * fuse_reply_readlink ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ */ ++ void (*readlink) (fuse_req_t req, fuse_ino_t ino); ++ ++ /** ++ * Create file node ++ * ++ * Create a regular file, character device, block device, fifo or ++ * socket node. ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode file type and mode with which to create the new file ++ * @param rdev the device number (only valid if created file is a device) ++ */ ++ void (*mknod) (fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, dev_t rdev); ++ ++ /** ++ * Create a directory ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode with which to create the new file ++ */ ++ void (*mkdir) (fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode); ++ ++ /** ++ * Remove a file ++ * ++ * If the file's inode's lookup count is non-zero, the file ++ * system is expected to postpone any removal of the inode ++ * until the lookup count reaches zero (see description of the ++ * forget function). 
++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to remove ++ */ ++ void (*unlink) (fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Remove a directory ++ * ++ * If the directory's inode's lookup count is non-zero, the ++ * file system is expected to postpone any removal of the ++ * inode until the lookup count reaches zero (see description ++ * of the forget function). ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to remove ++ */ ++ void (*rmdir) (fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Create a symbolic link ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param link the contents of the symbolic link ++ * @param parent inode number of the parent directory ++ * @param name to create ++ */ ++ void (*symlink) (fuse_req_t req, const char *link, fuse_ino_t parent, ++ const char *name); ++ ++ /** Rename a file ++ * ++ * If the target exists it should be atomically replaced. If ++ * the target's inode's lookup count is non-zero, the file ++ * system is expected to postpone any removal of the inode ++ * until the lookup count reaches zero (see description of the ++ * forget function). ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EINVAL, i.e. all ++ * future rename requests will fail with EINVAL without being ++ * sent to the filesystem process. ++ * ++ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If ++ * RENAME_NOREPLACE is specified, the filesystem must not ++ * overwrite *newname* if it exists and return an error ++ * instead. If `RENAME_EXCHANGE` is specified, the filesystem ++ * must atomically exchange the two files, i.e.
both must ++ * exist and neither may be deleted. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the old parent directory ++ * @param name old name ++ * @param newparent inode number of the new parent directory ++ * @param newname new name ++ */ ++ void (*rename) (fuse_req_t req, fuse_ino_t parent, const char *name, ++ fuse_ino_t newparent, const char *newname, ++ unsigned int flags); ++ ++ /** ++ * Create a hard link ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the old inode number ++ * @param newparent inode number of the new parent directory ++ * @param newname new name to create ++ */ ++ void (*link) (fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, ++ const char *newname); ++ ++ /** ++ * Open a file ++ * ++ * Open flags are available in fi->flags. The following rules ++ * apply. ++ * ++ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be ++ * filtered out / handled by the kernel. ++ * ++ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used ++ * by the filesystem to check if the operation is ++ * permitted. If the ``-o default_permissions`` mount ++ * option is given, this check is already done by the ++ * kernel before calling open() and may thus be omitted by ++ * the filesystem. ++ * ++ * - When writeback caching is enabled, the kernel may send ++ * read requests even for files opened with O_WRONLY. The ++ * filesystem should be prepared to handle this. ++ * ++ * - When writeback caching is disabled, the filesystem is ++ * expected to properly handle the O_APPEND flag and ensure ++ * that each write is appending to the end of the file. ++ * ++ * - When writeback caching is enabled, the kernel will ++ * handle O_APPEND. However, unless all changes to the file ++ * come through the kernel this will not work reliably. 
The ++ * filesystem should thus either ignore the O_APPEND flag ++ * (and let the kernel handle it), or return an error ++ * (indicating that reliably O_APPEND is not available). ++ * ++ * Filesystem may store an arbitrary file handle (pointer, ++ * index, etc) in fi->fh, and use this in all other file ++ * operations (read, write, flush, release, fsync). ++ * ++ * Filesystem may also implement stateless file I/O and not store ++ * anything in fi->fh. ++ * ++ * There are also some flags (direct_io, keep_cache) which the ++ * filesystem may set in fi, to change the way the file is opened. ++ * See fuse_file_info structure in <fuse_common.h> for more details. ++ * ++ * If this request is answered with an error code of ENOSYS ++ * and FUSE_CAP_NO_OPEN_SUPPORT is set in ++ * `fuse_conn_info.capable`, this is treated as success and ++ * future calls to open and release will also succeed without being ++ * sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_open ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*open) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Read data ++ * ++ * Read should send exactly the number of bytes requested except ++ * on EOF or error, otherwise the rest of the data will be ++ * substituted with zeroes. An exception to this is when the file ++ * has been opened in 'direct_io' mode, in which case the return ++ * value of the read system call will reflect the return value of ++ * this operation. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value.
++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_iov ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size number of bytes to read ++ * @param off offset to read from ++ * @param fi file information ++ */ ++ void (*read) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Write data ++ * ++ * Write should return exactly the number of bytes requested ++ * except on error. An exception to this is when the file has ++ * been opened in 'direct_io' mode, in which case the return value ++ * of the write system call will reflect the return value of this ++ * operation. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param buf data to write ++ * @param size number of bytes to write ++ * @param off offset to write to ++ * @param fi file information ++ */ ++ void (*write) (fuse_req_t req, fuse_ino_t ino, const char *buf, ++ size_t size, off_t off, struct fuse_file_info *fi); ++ ++ /** ++ * Flush method ++ * ++ * This is called on each close() of the opened file. ++ * ++ * Since file descriptors can be duplicated (dup, dup2, fork), for ++ * one open call there may be many flush calls. ++ * ++ * Filesystems shouldn't assume that flush will always be called ++ * after some writes, or that it will be called at all. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * NOTE: the name of the method is misleading, since (unlike ++ * fsync) the filesystem is not forced to flush pending writes.
++ * One reason to flush data is if the filesystem wants to return ++ * write errors during close. However, such use is non-portable ++ * because POSIX does not require [close] to wait for delayed I/O to ++ * complete. ++ * ++ * If the filesystem supports file locking operations (setlk, ++ * getlk) it should remove all locks belonging to 'fi->owner'. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to flush() will ++ * succeed automatically without being sent to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * ++ * [close]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html ++ */ ++ void (*flush) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Release an open file ++ * ++ * Release is called when there are no more references to an open ++ * file: all file descriptors are closed and all memory mappings ++ * are unmapped. ++ * ++ * For every open call there will be exactly one release call (unless ++ * the filesystem is force-unmounted). ++ * ++ * The filesystem may reply with an error, but error values are ++ * not returned to close() or munmap() which triggered the ++ * release. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * fi->flags will contain the same flags as for open. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*release) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Synchronize file contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data.
++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to fsync() will ++ * succeed automatically without being sent to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param datasync flag indicating if only data should be flushed ++ * @param fi file information ++ */ ++ void (*fsync) (fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Open a directory ++ * ++ * Filesystem may store an arbitrary file handle (pointer, index, ++ * etc) in fi->fh, and use this in all other directory ++ * stream operations (readdir, releasedir, fsyncdir). ++ * ++ * If this request is answered with an error code of ENOSYS and ++ * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`, ++ * this is treated as success and future calls to opendir and ++ * releasedir will also succeed without being sent to the filesystem ++ * process. In addition, the kernel will cache readdir results ++ * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR. ++ * ++ * Valid replies: ++ * fuse_reply_open ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*opendir) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Read directory ++ * ++ * Send a buffer filled using fuse_add_direntry(), with size not ++ * exceeding the requested size. Send an empty buffer on end of ++ * stream. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * Returning a directory entry from readdir() does not affect ++ * its lookup count. ++ * ++ * If off_t is non-zero, then it will correspond to one of the off_t ++ * values that was previously returned by readdir() for the same ++ * directory handle.
In this case, readdir() should skip over entries ++ * coming before the position defined by the off_t value. If entries ++ * are added or removed while the directory handle is open, the filesystem ++ * may still include the entries that have been removed, and may not ++ * report the entries that have been created. However, addition or ++ * removal of entries must never cause readdir() to skip over unrelated ++ * entries or to report them more than once. This means ++ * that off_t can not be a simple index that enumerates the entries ++ * that have been returned but must contain sufficient information to ++ * uniquely determine the next directory entry to return even when the ++ * set of entries is changing. ++ * ++ * The function does not have to report the '.' and '..' ++ * entries, but is allowed to do so. Note that, if readdir does ++ * not return '.' or '..', they will not be implicitly returned, ++ * and this behavior is observable by the caller. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum number of bytes to send ++ * @param off offset to continue reading the directory stream ++ * @param fi file information ++ */ ++ void (*readdir) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Release an open directory ++ * ++ * For every opendir call there will be exactly one releasedir ++ * call (unless the filesystem is force-unmounted). ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value.
++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*releasedir) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Synchronize directory contents ++ * ++ * If the datasync parameter is non-zero, then only the directory ++ * contents should be flushed, not the meta data. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to fsyncdir() will ++ * succeed automatically without being sent to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param datasync flag indicating if only data should be flushed ++ * @param fi file information ++ */ ++ void (*fsyncdir) (fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Get file system statistics ++ * ++ * Valid replies: ++ * fuse_reply_statfs ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number, zero means "undefined" ++ */ ++ void (*statfs) (fuse_req_t req, fuse_ino_t ino); ++ ++ /** ++ * Set an extended attribute ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future setxattr() requests will fail with EOPNOTSUPP without being ++ * sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ */ ++ void (*setxattr) (fuse_req_t req, fuse_ino_t ino, const char *name, ++ const char *value, size_t size, int flags); ++ ++ /** ++ * Get an extended attribute ++ * ++ * If size is zero, the size of the value should be sent with ++ * fuse_reply_xattr.
++ * ++ * If the size is non-zero, and the value fits in the buffer, the ++ * value should be sent with fuse_reply_buf. ++ * ++ * If the size is too small for the value, the ERANGE error should ++ * be sent. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future getxattr() requests will fail with EOPNOTSUPP without being ++ * sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_xattr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param name of the extended attribute ++ * @param size maximum size of the value to send ++ */ ++ void (*getxattr) (fuse_req_t req, fuse_ino_t ino, const char *name, ++ size_t size); ++ ++ /** ++ * List extended attribute names ++ * ++ * If size is zero, the total size of the attribute list should be ++ * sent with fuse_reply_xattr. ++ * ++ * If the size is non-zero, and the null character separated ++ * attribute list fits in the buffer, the list should be sent with ++ * fuse_reply_buf. ++ * ++ * If the size is too small for the list, the ERANGE error should ++ * be sent. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future listxattr() requests will fail with EOPNOTSUPP without being ++ * sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_xattr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum size of the list to send ++ */ ++ void (*listxattr) (fuse_req_t req, fuse_ino_t ino, size_t size); ++ ++ /** ++ * Remove an extended attribute ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e.
all ++ * future removexattr() requests will fail with EOPNOTSUPP without being ++ * sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param name of the extended attribute ++ */ ++ void (*removexattr) (fuse_req_t req, fuse_ino_t ino, const char *name); ++ ++ /** ++ * Check file access permissions ++ * ++ * This will be called for the access() and chdir() system ++ * calls. If the 'default_permissions' mount option is given, ++ * this method is not called. ++ * ++ * This method is not called under Linux kernel versions 2.4.x ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent success, i.e. this and all future access() ++ * requests will succeed without being sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param mask requested access mode ++ */ ++ void (*access) (fuse_req_t req, fuse_ino_t ino, int mask); ++ ++ /** ++ * Create and open a file ++ * ++ * If the file does not exist, first create it with the specified ++ * mode, and then open it. ++ * ++ * See the description of the open handler for more ++ * information. ++ * ++ * If this method is not implemented or under Linux kernel ++ * versions earlier than 2.6.15, the mknod() and open() methods ++ * will be called instead. ++ * ++ * If this request is answered with an error code of ENOSYS, the handler ++ * is treated as not implemented (i.e., for this and future requests the ++ * mknod() and open() handlers will be called instead).
++ * ++ * Valid replies: ++ * fuse_reply_create ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode file type and mode with which to create the new file ++ * @param fi file information ++ */ ++ void (*create) (fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, struct fuse_file_info *fi); ++ ++ /** ++ * Test for a POSIX file lock ++ * ++ * Valid replies: ++ * fuse_reply_lock ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param lock the region/type to test ++ */ ++ void (*getlk) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi, struct flock *lock); ++ ++ /** ++ * Acquire, modify or release a POSIX file lock ++ * ++ * For POSIX threads (NPTL) there's a 1-1 relation between pid and ++ * owner, but otherwise this is not always the case. For checking ++ * lock ownership, 'fi->owner' must be used. The l_pid field in ++ * 'struct flock' should only be used to fill in this field in ++ * getlk(). ++ * ++ * Note: if the locking methods are not implemented, the kernel ++ * will still allow file locking to work locally. Hence these are ++ * only interesting for network filesystems and similar. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param lock the region/type to set ++ * @param sleep locking operation may sleep ++ */ ++ void (*setlk) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi, ++ struct flock *lock, int sleep); ++ ++ /** ++ * Map block index within file to block index within device ++ * ++ * Note: This makes sense only for block device backed filesystems ++ * mounted with the 'blkdev' option ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure, i.e. 
all future bmap() requests will ++ * fail with the same error code without being sent to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_bmap ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param blocksize unit of block index ++ * @param idx block index within file ++ */ ++ void (*bmap) (fuse_req_t req, fuse_ino_t ino, size_t blocksize, ++ uint64_t idx); ++ ++ /** ++ * Ioctl ++ * ++ * Note: For unrestricted ioctls (not allowed for FUSE ++ * servers), data in and out areas can be discovered by giving ++ * iovs and setting FUSE_IOCTL_RETRY in *flags*. For ++ * restricted ioctls, kernel prepares in/out data area ++ * according to the information encoded in cmd. ++ * ++ * Valid replies: ++ * fuse_reply_ioctl_retry ++ * fuse_reply_ioctl ++ * fuse_reply_ioctl_iov ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param cmd ioctl command ++ * @param arg ioctl argument ++ * @param fi file information ++ * @param flags for FUSE_IOCTL_* flags ++ * @param in_buf data fetched from the caller ++ * @param in_bufsz number of fetched bytes ++ * @param out_bufsz maximum size of output data ++ * ++ * Note: the unsigned long request submitted by the application ++ * is truncated to 32 bits. ++ */ ++ void (*ioctl) (fuse_req_t req, fuse_ino_t ino, unsigned int cmd, ++ void *arg, struct fuse_file_info *fi, unsigned flags, ++ const void *in_buf, size_t in_bufsz, size_t out_bufsz); ++ ++ /** ++ * Poll for IO readiness ++ * ++ * Note: If ph is non-NULL, the client should notify ++ * when IO readiness events occur by calling ++ * fuse_lowlevel_notify_poll() with the specified ph. ++ * ++ * Regardless of the number of times poll with a non-NULL ph ++ * is received, single notification is enough to clear all. ++ * Notifying more times incurs overhead but doesn't harm ++ * correctness.
++ * ++ * The callee is responsible for destroying ph with ++ * fuse_pollhandle_destroy() when no longer in use. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as success (with a kernel-defined default poll-mask) and ++ * future calls to poll() will succeed the same way without being sent ++ * to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_poll ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param ph poll handle to be used for notification ++ */ ++ void (*poll) (fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct fuse_pollhandle *ph); ++ ++ /** ++ * Write data made available in a buffer ++ * ++ * This is a more generic version of the ->write() method. If ++ * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the ++ * kernel supports splicing from the fuse device, then the ++ * data will be made available in pipe for supporting zero ++ * copy data transfer. ++ * ++ * buf->count is guaranteed to be one (and thus buf->idx is ++ * always zero). The write_buf handler must ensure that ++ * bufv->off is correctly updated (reflecting the number of ++ * bytes read from bufv->buf[0]). ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits.
++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param bufv buffer containing the data ++ * @param off offset to write to ++ * @param fi file information ++ */ ++ void (*write_buf) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_bufvec *bufv, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Callback function for the retrieve request ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ * @param cookie user data supplied to fuse_lowlevel_notify_retrieve() ++ * @param ino the inode number supplied to fuse_lowlevel_notify_retrieve() ++ * @param offset the offset supplied to fuse_lowlevel_notify_retrieve() ++ * @param bufv the buffer containing the returned data ++ */ ++ void (*retrieve_reply) (fuse_req_t req, void *cookie, fuse_ino_t ino, ++ off_t offset, struct fuse_bufvec *bufv); ++ ++ /** ++ * Forget about multiple inodes ++ * ++ * See description of the forget function for more ++ * information. ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ */ ++ void (*forget_multi) (fuse_req_t req, size_t count, ++ struct fuse_forget_data *forgets); ++ ++ /** ++ * Acquire, modify or release a BSD file lock ++ * ++ * Note: if the locking methods are not implemented, the kernel ++ * will still allow file locking to work locally. Hence these are ++ * only interesting for network filesystems and similar. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param op the locking operation, see flock(2) ++ */ ++ void (*flock) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi, int op); ++ ++ /** ++ * Allocate requested space. If this function returns success then ++ * subsequent writes to the specified range shall not fail due to the lack ++ * of free space on the file system storage media. 
++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future fallocate() requests will fail with EOPNOTSUPP without being ++ * sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param offset starting point for allocated region ++ * @param length size of allocated region ++ * @param mode determines the operation to be performed on the given range, ++ * see fallocate(2) ++ */ ++ void (*fallocate) (fuse_req_t req, fuse_ino_t ino, int mode, ++ off_t offset, off_t length, struct fuse_file_info *fi); ++ ++ /** ++ * Read directory with attributes ++ * ++ * Send a buffer filled using fuse_add_direntry_plus(), with size not ++ * exceeding the requested size. Send an empty buffer on end of ++ * stream. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * In contrast to readdir() (which does not affect the lookup counts), ++ * the lookup count of every entry returned by readdirplus(), except "." ++ * and "..", is incremented by one. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum number of bytes to send ++ * @param off offset to continue reading the directory stream ++ * @param fi file information ++ */ ++ void (*readdirplus) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Copy a range of data from one file to another ++ * ++ * Performs an optimized copy between two file descriptors without the ++ * additional cost of transferring data through the FUSE kernel module ++ * to user space (glibc) and then back into the FUSE filesystem again.
++ * ++ * In case this method is not implemented, glibc falls back to reading ++ * data from the source and writing to the destination. Effectively ++ * doing an inefficient copy of the data. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future copy_file_range() requests will fail with EOPNOTSUPP without ++ * being sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino_in the inode number of the source file ++ * @param off_in starting point from where the data should be read ++ * @param fi_in file information of the source file ++ * @param ino_out the inode number of the destination file ++ * @param off_out starting point where the data should be written ++ * @param fi_out file information of the destination file ++ * @param len maximum size of the data to copy ++ * @param flags passed along with the copy_file_range() syscall ++ */ ++ void (*copy_file_range) (fuse_req_t req, fuse_ino_t ino_in, ++ off_t off_in, struct fuse_file_info *fi_in, ++ fuse_ino_t ino_out, off_t off_out, ++ struct fuse_file_info *fi_out, size_t len, ++ int flags); ++ ++ /** ++ * Find next data or hole after the specified offset ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure, i.e. all future lseek() requests will ++ * fail with the same error code without being sent to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_lseek ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param off offset to start search from ++ * @param whence either SEEK_DATA or SEEK_HOLE ++ * @param fi file information ++ */ ++ void (*lseek) (fuse_req_t req, fuse_ino_t ino, off_t off, int whence, ++ struct fuse_file_info *fi); ++}; ++ ++/** ++ * Reply with an error code or success.
++ * ++ * Possible requests: ++ * all except forget ++ * ++ * Wherever possible, error codes should be chosen from the list of ++ * documented error conditions in the corresponding system calls ++ * manpage. ++ * ++ * An error code of ENOSYS is sometimes treated specially. This is ++ * indicated in the documentation of the affected handler functions. ++ * ++ * The following requests may be answered with a zero error code: ++ * unlink, rmdir, rename, flush, release, fsync, fsyncdir, setxattr, ++ * removexattr, setlk. ++ * ++ * @param req request handle ++ * @param err the positive error value, or zero for success ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_err(fuse_req_t req, int err); ++ ++/** ++ * Don't send reply ++ * ++ * Possible requests: ++ * forget ++ * forget_multi ++ * retrieve_reply ++ * ++ * @param req request handle ++ */ ++void fuse_reply_none(fuse_req_t req); ++ ++/** ++ * Reply with a directory entry ++ * ++ * Possible requests: ++ * lookup, mknod, mkdir, symlink, link ++ * ++ * Side effects: ++ * increments the lookup count on success ++ * ++ * @param req request handle ++ * @param e the entry parameters ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e); ++ ++/** ++ * Reply with a directory entry and open parameters ++ * ++ * currently the following members of 'fi' are used: ++ * fh, direct_io, keep_cache ++ * ++ * Possible requests: ++ * create ++ * ++ * Side effects: ++ * increments the lookup count on success ++ * ++ * @param req request handle ++ * @param e the entry parameters ++ * @param fi file information ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, ++ const struct fuse_file_info *fi); ++ ++/** ++ * Reply with attributes ++ * ++ * Possible requests: ++ * getattr, setattr ++ * ++ * @param req request handle ++
* @param attr the attributes ++ * @param attr_timeout validity timeout (in seconds) for the attributes ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_attr(fuse_req_t req, const struct stat *attr, ++ double attr_timeout); ++ ++/** ++ * Reply with the contents of a symbolic link ++ * ++ * Possible requests: ++ * readlink ++ * ++ * @param req request handle ++ * @param link symbolic link contents ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_readlink(fuse_req_t req, const char *link); ++ ++/** ++ * Reply with open parameters ++ * ++ * currently the following members of 'fi' are used: ++ * fh, direct_io, keep_cache ++ * ++ * Possible requests: ++ * open, opendir ++ * ++ * @param req request handle ++ * @param fi file information ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *fi); ++ ++/** ++ * Reply with number of bytes written ++ * ++ * Possible requests: ++ * write ++ * ++ * @param req request handle ++ * @param count the number of bytes written ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_write(fuse_req_t req, size_t count); ++ ++/** ++ * Reply with data ++ * ++ * Possible requests: ++ * read, readdir, getxattr, listxattr ++ * ++ * @param req request handle ++ * @param buf buffer containing data ++ * @param size the size of data in bytes ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); ++ ++/** ++ * Reply with data copied/moved from buffer(s) ++ * ++ * Zero copy data transfer ("splicing") will be used under ++ * the following circumstances: ++ * ++ * 1. FUSE_CAP_SPLICE_WRITE is set in fuse_conn_info.want, and ++ * 2. the kernel supports splicing from the fuse device ++ * (FUSE_CAP_SPLICE_WRITE is set in fuse_conn_info.capable), and ++ * 3. 
*flags* does not contain FUSE_BUF_NO_SPLICE, and ++ * 4. The amount of data that is provided in file-descriptor backed ++ * buffers (i.e., buffers for which bufv[n].flags == FUSE_BUF_FD) ++ * is at least twice the page size. ++ * ++ * In order for SPLICE_F_MOVE to be used, the following additional ++ * conditions have to be fulfilled: ++ * ++ * 1. FUSE_CAP_SPLICE_MOVE is set in fuse_conn_info.want, and ++ * 2. the kernel supports it (i.e., FUSE_CAP_SPLICE_MOVE is set in ++ * fuse_conn_info.capable), and ++ * 3. *flags* contains FUSE_BUF_SPLICE_MOVE ++ * ++ * Note that, if splice is used, the data is actually spliced twice: ++ * once into a temporary pipe (to prepend header data), and then again ++ * into the kernel. If some of the provided buffers are memory-backed, ++ * the data in them is copied in step one and spliced in step two. ++ * ++ * The FUSE_BUF_SPLICE_FORCE_SPLICE and FUSE_BUF_SPLICE_NONBLOCK flags ++ * are silently ignored. ++ * ++ * Possible requests: ++ * read, readdir, getxattr, listxattr ++ * ++ * Side effects: ++ * when used to return data from a readdirplus() (but not readdir()) ++ * call, increments the lookup count of each returned entry by one ++ * on success.
++ * ++ * @param req request handle ++ * @param bufv buffer vector ++ * @param flags flags controlling the copy ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags); ++ ++/** ++ * Reply with data vector ++ * ++ * Possible requests: ++ * read, readdir, getxattr, listxattr ++ * ++ * @param req request handle ++ * @param iov the vector containing the data ++ * @param count the size of vector ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count); ++ ++/** ++ * Reply with filesystem statistics ++ * ++ * Possible requests: ++ * statfs ++ * ++ * @param req request handle ++ * @param stbuf filesystem statistics ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf); ++ ++/** ++ * Reply with needed buffer size ++ * ++ * Possible requests: ++ * getxattr, listxattr ++ * ++ * @param req request handle ++ * @param count the buffer size needed in bytes ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_xattr(fuse_req_t req, size_t count); ++ ++/** ++ * Reply with file lock information ++ * ++ * Possible requests: ++ * getlk ++ * ++ * @param req request handle ++ * @param lock the lock information ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_lock(fuse_req_t req, const struct flock *lock); ++ ++/** ++ * Reply with block index ++ * ++ * Possible requests: ++ * bmap ++ * ++ * @param req request handle ++ * @param idx block index within device ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_bmap(fuse_req_t req, uint64_t idx); ++ ++/* ----------------------------------------------------------- * ++ * Filling a buffer in readdir * ++ * 
----------------------------------------------------------- */ ++ ++/** ++ * Add a directory entry to the buffer ++ * ++ * Buffer needs to be large enough to hold the entry. If it's not, ++ * then the entry is not filled in but the size of the entry is still ++ * returned. The caller can check this by comparing the bufsize ++ * parameter with the returned entry size. If the entry size is ++ * larger than the buffer size, the operation failed. ++ * ++ * From the 'stbuf' argument the st_ino field and bits 12-15 of the ++ * st_mode field are used. The other fields are ignored. ++ * ++ * *off* should be any non-zero value that the filesystem can use to ++ * identify the current point in the directory stream. It does not ++ * need to be the actual physical position. A value of zero is ++ * reserved to mean "from the beginning", and should therefore never ++ * be used (the first call to fuse_add_direntry should be passed the ++ * offset of the second directory entry). ++ * ++ * @param req request handle ++ * @param buf the point where the new entry will be added to the buffer ++ * @param bufsize remaining size of the buffer ++ * @param name the name of the entry ++ * @param stbuf the file attributes ++ * @param off the offset of the next entry ++ * @return the space needed for the entry ++ */ ++size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, ++ const char *name, const struct stat *stbuf, ++ off_t off); ++ ++/** ++ * Add a directory entry to the buffer with the attributes ++ * ++ * See documentation of `fuse_add_direntry()` for more details. 
++ * ++ * @param req request handle ++ * @param buf the point where the new entry will be added to the buffer ++ * @param bufsize remaining size of the buffer ++ * @param name the name of the entry ++ * @param e the directory entry ++ * @param off the offset of the next entry ++ * @return the space needed for the entry ++ */ ++size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, ++ const char *name, ++ const struct fuse_entry_param *e, off_t off); ++ ++/** ++ * Reply to ask for data fetch and output buffer preparation. ioctl ++ * will be retried with the specified input data fetched and output ++ * buffer prepared. ++ * ++ * Possible requests: ++ * ioctl ++ * ++ * @param req request handle ++ * @param in_iov iovec specifying data to fetch from the caller ++ * @param in_count number of entries in in_iov ++ * @param out_iov iovec specifying addresses to write output to ++ * @param out_count number of entries in out_iov ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_ioctl_retry(fuse_req_t req, ++ const struct iovec *in_iov, size_t in_count, ++ const struct iovec *out_iov, size_t out_count); ++ ++/** ++ * Reply to finish ioctl ++ * ++ * Possible requests: ++ * ioctl ++ * ++ * @param req request handle ++ * @param result result to be passed to the caller ++ * @param buf buffer containing output data ++ * @param size length of output data ++ */ ++int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size); ++ ++/** ++ * Reply to finish ioctl with iov buffer ++ * ++ * Possible requests: ++ * ioctl ++ * ++ * @param req request handle ++ * @param result result to be passed to the caller ++ * @param iov the vector containing the data ++ * @param count the size of vector ++ */ ++int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, ++ int count); ++ ++/** ++ * Reply with poll result event mask ++ * ++ * @param req request handle ++ * @param revents poll result event mask ++ 
*/ ++int fuse_reply_poll(fuse_req_t req, unsigned revents); ++ ++/** ++ * Reply with offset ++ * ++ * Possible requests: ++ * lseek ++ * ++ * @param req request handle ++ * @param off offset of next data or hole ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_lseek(fuse_req_t req, off_t off); ++ ++/* ----------------------------------------------------------- * ++ * Notification * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Notify IO readiness event ++ * ++ * For more information, please read comment for poll operation. ++ * ++ * @param ph poll handle to notify IO readiness event for ++ */ ++int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph); ++ ++/** ++ * Notify to invalidate cache for an inode. ++ * ++ * Added in FUSE protocol version 7.12. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. ++ * ++ * If the filesystem has writeback caching enabled, invalidating an ++ * inode will first trigger a writeback of all dirty pages. The call ++ * will block until all writeback requests have completed and the ++ * inode has been invalidated. It will, however, not wait for ++ * completion of pending writeback requests that have been issued ++ * before. ++ * ++ * If there are no dirty pages, this function will never block. 
++ * ++ * @param se the session object ++ * @param ino the inode number ++ * @param off the offset in the inode where to start invalidating ++ * or negative to invalidate attributes only ++ * @param len the amount of cache to invalidate or 0 for all ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, ++ off_t off, off_t len); ++ ++/** ++ * Notify to invalidate parent attributes and the dentry matching ++ * parent/name ++ * ++ * To avoid a deadlock this function must not be called in the ++ * execution path of a related filesytem operation or within any code ++ * that could hold a lock that could be needed to execute such an ++ * operation. As of kernel 4.18, a "related operation" is a lookup(), ++ * symlink(), mknod(), mkdir(), unlink(), rename(), link() or create() ++ * request for the parent, and a setattr(), unlink(), rmdir(), ++ * rename(), setxattr(), removexattr(), readdir() or readdirplus() ++ * request for the inode itself. ++ * ++ * When called correctly, this function will never block. ++ * ++ * Added in FUSE protocol version 7.12. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. ++ * ++ * @param se the session object ++ * @param parent inode number ++ * @param name file name ++ * @param namelen strlen() of file name ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, ++ const char *name, size_t namelen); ++ ++/** ++ * This function behaves like fuse_lowlevel_notify_inval_entry() with ++ * the following additional effect (at least as of Linux kernel 4.8): ++ * ++ * If the provided *child* inode matches the inode that is currently ++ * associated with the cached dentry, and if there are any inotify ++ * watches registered for the dentry, then the watchers are informed ++ * that the dentry has been deleted. 
++ * ++ * To avoid a deadlock this function must not be called while ++ * executing a related filesytem operation or while holding a lock ++ * that could be needed to execute such an operation (see the ++ * description of fuse_lowlevel_notify_inval_entry() for more ++ * details). ++ * ++ * When called correctly, this function will never block. ++ * ++ * Added in FUSE protocol version 7.18. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. ++ * ++ * @param se the session object ++ * @param parent inode number ++ * @param child inode number ++ * @param name file name ++ * @param namelen strlen() of file name ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_delete(struct fuse_session *se, ++ fuse_ino_t parent, fuse_ino_t child, ++ const char *name, size_t namelen); ++ ++/** ++ * Store data to the kernel buffers ++ * ++ * Synchronously store data in the kernel buffers belonging to the ++ * given inode. The stored data is marked up-to-date (no read will be ++ * performed against it, unless it's invalidated or evicted from the ++ * cache). ++ * ++ * If the stored data overflows the current file size, then the size ++ * is extended, similarly to a write(2) on the filesystem. ++ * ++ * If this function returns an error, then the store wasn't fully ++ * completed, but it may have been partially completed. ++ * ++ * Added in FUSE protocol version 7.15. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. 
++ * ++ * @param se the session object ++ * @param ino the inode number ++ * @param offset the starting offset into the file to store to ++ * @param bufv buffer vector ++ * @param flags flags controlling the copy ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, ++ off_t offset, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags); ++/** ++ * Retrieve data from the kernel buffers ++ * ++ * Retrieve data in the kernel buffers belonging to the given inode. ++ * If successful then the retrieve_reply() method will be called with ++ * the returned data. ++ * ++ * Only present pages are returned in the retrieve reply. Retrieving ++ * stops when it finds a non-present page and only data prior to that ++ * is returned. ++ * ++ * If this function returns an error, then the retrieve will not be ++ * completed and no reply will be sent. ++ * ++ * This function doesn't change the dirty state of pages in the kernel ++ * buffer. For dirty pages the write() method will be called ++ * regardless of having been retrieved previously. ++ * ++ * Added in FUSE protocol version 7.15. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. 
++ * ++ * @param se the session object ++ * @param ino the inode number ++ * @param size the number of bytes to retrieve ++ * @param offset the starting offset into the file to retrieve from ++ * @param cookie user data to supply to the reply callback ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, ++ size_t size, off_t offset, void *cookie); ++ ++ ++/* ----------------------------------------------------------- * ++ * Utility functions * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Get the userdata from the request ++ * ++ * @param req request handle ++ * @return the user data passed to fuse_session_new() ++ */ ++void *fuse_req_userdata(fuse_req_t req); ++ ++/** ++ * Get the context from the request ++ * ++ * The pointer returned by this function will only be valid for the ++ * request's lifetime ++ * ++ * @param req request handle ++ * @return the context structure ++ */ ++const struct fuse_ctx *fuse_req_ctx(fuse_req_t req); ++ ++/** ++ * Get the current supplementary group IDs for the specified request ++ * ++ * Similar to the getgroups(2) system call, except the return value is ++ * always the total number of group IDs, even if it is larger than the ++ * specified size. ++ * ++ * The current fuse kernel module in linux (as of 2.6.30) doesn't pass ++ * the group list to userspace, hence this function needs to parse ++ * "/proc/$TID/task/$TID/status" to get the group IDs. ++ * ++ * This feature may not be supported on all operating systems. In ++ * such a case this function will return -ENOSYS. 
++ * ++ * @param req request handle ++ * @param size size of given array ++ * @param list array of group IDs to be filled in ++ * @return the total number of supplementary group IDs or -errno on failure ++ */ ++int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]); ++ ++/** ++ * Callback function for an interrupt ++ * ++ * @param req interrupted request ++ * @param data user data ++ */ ++typedef void (*fuse_interrupt_func_t)(fuse_req_t req, void *data); ++ ++/** ++ * Register/unregister callback for an interrupt ++ * ++ * If an interrupt has already happened, then the callback function is ++ * called from within this function, hence it's not possible for ++ * interrupts to be lost. ++ * ++ * @param req request handle ++ * @param func the callback function or NULL for unregister ++ * @param data user data passed to the callback function ++ */ ++void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, ++ void *data); ++ ++/** ++ * Check if a request has already been interrupted ++ * ++ * @param req request handle ++ * @return 1 if the request has been interrupted, 0 otherwise ++ */ ++int fuse_req_interrupted(fuse_req_t req); ++ ++ ++/* ----------------------------------------------------------- * ++ * Inquiry functions * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Print low-level version information to stdout. ++ */ ++void fuse_lowlevel_version(void); ++ ++/** ++ * Print available low-level options to stdout. This is not an ++ * exhaustive list, but includes only those options that may be of ++ * interest to an end-user of a file system. ++ */ ++void fuse_lowlevel_help(void); ++ ++/** ++ * Print available options for `fuse_parse_cmdline()`. 
++ */ ++void fuse_cmdline_help(void); ++ ++/* ----------------------------------------------------------- * ++ * Filesystem setup & teardown * ++ * ----------------------------------------------------------- */ ++ ++struct fuse_cmdline_opts { ++ int singlethread; ++ int foreground; ++ int debug; ++ int nodefault_subtype; ++ char *mountpoint; ++ int show_version; ++ int show_help; ++ int clone_fd; ++ unsigned int max_idle_threads; ++}; ++ ++/** ++ * Utility function to parse common options for simple file systems ++ * using the low-level API. A help text that describes the available ++ * options can be printed with `fuse_cmdline_help`. A single ++ * non-option argument is treated as the mountpoint. Multiple ++ * non-option arguments will result in an error. ++ * ++ * If neither -o subtype= or -o fsname= options are given, a new ++ * subtype option will be added and set to the basename of the program ++ * (the fsname will remain unset, and then defaults to "fuse"). ++ * ++ * Known options will be removed from *args*, unknown options will ++ * remain. ++ * ++ * @param args argument vector (input+output) ++ * @param opts output argument for parsed options ++ * @return 0 on success, -1 on failure ++ */ ++int fuse_parse_cmdline(struct fuse_args *args, ++ struct fuse_cmdline_opts *opts); ++ ++/** ++ * Create a low level session. ++ * ++ * Returns a session structure suitable for passing to ++ * fuse_session_mount() and fuse_session_loop(). ++ * ++ * This function accepts most file-system independent mount options ++ * (like context, nodev, ro - see mount(8)), as well as the general ++ * fuse mount options listed in mount.fuse(8) (e.g. -o allow_root and ++ * -o default_permissions, but not ``-o use_ino``). Instead of `-o ++ * debug`, debugging may also enabled with `-d` or `--debug`. ++ * ++ * If not all options are known, an error message is written to stderr ++ * and the function returns NULL. 
++ * ++ * Option parsing skips argv[0], which is assumed to contain the ++ * program name. To prevent accidentally passing an option in ++ * argv[0], this element must always be present (even if no options ++ * are specified). It may be set to the empty string ('\0') if no ++ * reasonable value can be provided. ++ * ++ * @param args argument vector ++ * @param op the (low-level) filesystem operations ++ * @param op_size sizeof(struct fuse_lowlevel_ops) ++ * @param userdata user data ++ * ++ * @return the fuse session on success, NULL on failure ++ **/ ++struct fuse_session *fuse_session_new(struct fuse_args *args, ++ const struct fuse_lowlevel_ops *op, ++ size_t op_size, void *userdata); ++ ++/** ++ * Mount a FUSE file system. ++ * ++ * @param mountpoint the mount point path ++ * @param se session object ++ * ++ * @return 0 on success, -1 on failure. ++ **/ ++int fuse_session_mount(struct fuse_session *se, const char *mountpoint); ++ ++/** ++ * Enter a single threaded, blocking event loop. ++ * ++ * When the event loop terminates because the connection to the FUSE ++ * kernel module has been closed, this function returns zero. This ++ * happens when the filesystem is unmounted regularly (by the ++ * filesystem owner or root running the umount(8) or fusermount(1) ++ * command), or if connection is explicitly severed by writing ``1`` ++ * to the``abort`` file in ``/sys/fs/fuse/connections/NNN``. The only ++ * way to distinguish between these two conditions is to check if the ++ * filesystem is still mounted after the session loop returns. ++ * ++ * When some error occurs during request processing, the function ++ * returns a negated errno(3) value. ++ * ++ * If the loop has been terminated because of a signal handler ++ * installed by fuse_set_signal_handlers(), this function returns the ++ * (positive) signal value that triggered the exit. 
++ * ++ * @param se the session ++ * @return 0, -errno, or a signal value ++ */ ++int fuse_session_loop(struct fuse_session *se); ++ ++/** ++ * Enter a multi-threaded event loop. ++ * ++ * For a description of the return value and the conditions when the ++ * event loop exits, refer to the documentation of ++ * fuse_session_loop(). ++ * ++ * @param se the session ++ * @param config session loop configuration ++ * @return see fuse_session_loop() ++ */ ++#if FUSE_USE_VERSION < 32 ++int fuse_session_loop_mt_31(struct fuse_session *se, int clone_fd); ++#define fuse_session_loop_mt(se, clone_fd) fuse_session_loop_mt_31(se, clone_fd) ++#else ++int fuse_session_loop_mt(struct fuse_session *se, struct fuse_loop_config *config); ++#endif ++ ++/** ++ * Flag a session as terminated. ++ * ++ * This function is invoked by the POSIX signal handlers, when ++ * registered using fuse_set_signal_handlers(). It will cause any ++ * running event loops to terminate on the next opportunity. ++ * ++ * @param se the session ++ */ ++void fuse_session_exit(struct fuse_session *se); ++ ++/** ++ * Reset the terminated flag of a session ++ * ++ * @param se the session ++ */ ++void fuse_session_reset(struct fuse_session *se); ++ ++/** ++ * Query the terminated flag of a session ++ * ++ * @param se the session ++ * @return 1 if exited, 0 if not exited ++ */ ++int fuse_session_exited(struct fuse_session *se); ++ ++/** ++ * Ensure that file system is unmounted. ++ * ++ * In regular operation, the file system is typically unmounted by the ++ * user calling umount(8) or fusermount(1), which then terminates the ++ * FUSE session loop. However, the session loop may also terminate as ++ * a result of an explicit call to fuse_session_exit() (e.g. by a ++ * signal handler installed by fuse_set_signal_handler()). 
In this ++ * case the filesystem remains mounted, but any attempt to access it ++ * will block (while the filesystem process is still running) or give ++ * an ESHUTDOWN error (after the filesystem process has terminated). ++ * ++ * If the communication channel with the FUSE kernel module is still ++ * open (i.e., if the session loop was terminated by an explicit call ++ * to fuse_session_exit()), this function will close it and unmount ++ * the filesystem. If the communication channel has been closed by the ++ * kernel, this method will do (almost) nothing. ++ * ++ * NOTE: The above semantics mean that if the connection to the kernel ++ * is terminated via the ``/sys/fs/fuse/connections/NNN/abort`` file, ++ * this method will *not* unmount the filesystem. ++ * ++ * @param se the session ++ */ ++void fuse_session_unmount(struct fuse_session *se); ++ ++/** ++ * Destroy a session ++ * ++ * @param se the session ++ */ ++void fuse_session_destroy(struct fuse_session *se); ++ ++/* ----------------------------------------------------------- * ++ * Custom event loop support * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Return file descriptor for communication with kernel. ++ * ++ * The file selector can be used to integrate FUSE with a custom event ++ * loop. Whenever data is available for reading on the provided fd, ++ * the event loop should call `fuse_session_receive_buf` followed by ++ * `fuse_session_process_buf` to process the request. ++ * ++ * The returned file descriptor is valid until `fuse_session_unmount` ++ * is called. ++ * ++ * @param se the session ++ * @return a file descriptor ++ */ ++int fuse_session_fd(struct fuse_session *se); ++ ++/** ++ * Process a raw request supplied in a generic buffer ++ * ++ * The fuse_buf may contain a memory buffer or a pipe file descriptor. 
++ * ++ * @param se the session ++ * @param buf the fuse_buf containing the request ++ */ ++void fuse_session_process_buf(struct fuse_session *se, ++ const struct fuse_buf *buf); ++ ++/** ++ * Read a raw request from the kernel into the supplied buffer. ++ * ++ * Depending on file system options, system capabilities, and request ++ * size the request is either read into a memory buffer or spliced ++ * into a temporary pipe. ++ * ++ * @param se the session ++ * @param buf the fuse_buf to store the request in ++ * @return the actual size of the raw request, or -errno on error ++ */ ++int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* FUSE_LOWLEVEL_H_ */ +diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h +new file mode 100644 +index 0000000..2f6663e +--- /dev/null ++++ b/tools/virtiofsd/fuse_misc.h +@@ -0,0 +1,59 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#include ++ ++/* ++ Versioned symbols cannot be used in some cases because it ++ - confuse the dynamic linker in uClibc ++ - not supported on MacOSX (in MachO binary format) ++*/ ++#if (!defined(__UCLIBC__) && !defined(__APPLE__)) ++#define FUSE_SYMVER(x) __asm__(x) ++#else ++#define FUSE_SYMVER(x) ++#endif ++ ++#ifndef USE_UCLIBC ++#define fuse_mutex_init(mut) pthread_mutex_init(mut, NULL) ++#else ++/* Is this hack still needed? 
*/ ++static inline void fuse_mutex_init(pthread_mutex_t *mut) ++{ ++ pthread_mutexattr_t attr; ++ pthread_mutexattr_init(&attr); ++ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); ++ pthread_mutex_init(mut, &attr); ++ pthread_mutexattr_destroy(&attr); ++} ++#endif ++ ++#ifdef HAVE_STRUCT_STAT_ST_ATIM ++/* Linux */ ++#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec) ++#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec) ++#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec) ++#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atim.tv_nsec = (val) ++#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctim.tv_nsec = (val) ++#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtim.tv_nsec = (val) ++#elif defined(HAVE_STRUCT_STAT_ST_ATIMESPEC) ++/* FreeBSD */ ++#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec) ++#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec) ++#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec) ++#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atimespec.tv_nsec = (val) ++#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctimespec.tv_nsec = (val) ++#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtimespec.tv_nsec = (val) ++#else ++#define ST_ATIM_NSEC(stbuf) 0 ++#define ST_CTIM_NSEC(stbuf) 0 ++#define ST_MTIM_NSEC(stbuf) 0 ++#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0) ++#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0) ++#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0) ++#endif +diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h +new file mode 100644 +index 0000000..d8573e7 +--- /dev/null ++++ b/tools/virtiofsd/fuse_opt.h +@@ -0,0 +1,271 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. 
++*/ ++ ++#ifndef FUSE_OPT_H_ ++#define FUSE_OPT_H_ ++ ++/** @file ++ * ++ * This file defines the option parsing interface of FUSE ++ */ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/** ++ * Option description ++ * ++ * This structure describes a single option, and action associated ++ * with it, in case it matches. ++ * ++ * More than one such match may occur, in which case the action for ++ * each match is executed. ++ * ++ * There are three possible actions in case of a match: ++ * ++ * i) An integer (int or unsigned) variable determined by 'offset' is ++ * set to 'value' ++ * ++ * ii) The processing function is called, with 'value' as the key ++ * ++ * iii) An integer (any) or string (char *) variable determined by ++ * 'offset' is set to the value of an option parameter ++ * ++ * 'offset' should normally be either set to ++ * ++ * - 'offsetof(struct foo, member)' actions i) and iii) ++ * ++ * - -1 action ii) ++ * ++ * The 'offsetof()' macro is defined in the header. ++ * ++ * The template determines which options match, and also have an ++ * effect on the action. Normally the action is either i) or ii), but ++ * if a format is present in the template, then action iii) is ++ * performed. ++ * ++ * The types of templates are: ++ * ++ * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only ++ * themselves. Invalid values are "--" and anything beginning ++ * with "-o" ++ * ++ * 2) "foo", "foo-bar", etc. These match "-ofoo", "-ofoo-bar" or ++ * the relevant option in a comma separated option list ++ * ++ * 3) "bar=", "--foo=", etc. These are variations of 1) and 2) ++ * which have a parameter ++ * ++ * 4) "bar=%s", "--foo=%lu", etc. Same matching as above but perform ++ * action iii). ++ * ++ * 5) "-x ", etc. Matches either "-xparam" or "-x param" as ++ * two separate arguments ++ * ++ * 6) "-x %s", etc. Combination of 4) and 5) ++ * ++ * If the format is "%s", memory is allocated for the string unlike with ++ * scanf(). 
The previous value (if non-NULL) stored at the this location is ++ * freed. ++ */ ++struct fuse_opt { ++ /** Matching template and optional parameter formatting */ ++ const char *templ; ++ ++ /** ++ * Offset of variable within 'data' parameter of fuse_opt_parse() ++ * or -1 ++ */ ++ unsigned long offset; ++ ++ /** ++ * Value to set the variable to, or to be passed as 'key' to the ++ * processing function. Ignored if template has a format ++ */ ++ int value; ++}; ++ ++/** ++ * Key option. In case of a match, the processing function will be ++ * called with the specified key. ++ */ ++#define FUSE_OPT_KEY(templ, key) { templ, -1U, key } ++ ++/** ++ * Last option. An array of 'struct fuse_opt' must end with a NULL ++ * template value ++ */ ++#define FUSE_OPT_END { NULL, 0, 0 } ++ ++/** ++ * Argument list ++ */ ++struct fuse_args { ++ /** Argument count */ ++ int argc; ++ ++ /** Argument vector. NULL terminated */ ++ char **argv; ++ ++ /** Is 'argv' allocated? */ ++ int allocated; ++}; ++ ++/** ++ * Initializer for 'struct fuse_args' ++ */ ++#define FUSE_ARGS_INIT(argc, argv) { argc, argv, 0 } ++ ++/** ++ * Key value passed to the processing function if an option did not ++ * match any template ++ */ ++#define FUSE_OPT_KEY_OPT -1 ++ ++/** ++ * Key value passed to the processing function for all non-options ++ * ++ * Non-options are the arguments beginning with a character other than ++ * '-' or all arguments after the special '--' option ++ */ ++#define FUSE_OPT_KEY_NONOPT -2 ++ ++/** ++ * Special key value for options to keep ++ * ++ * Argument is not passed to processing function, but behave as if the ++ * processing function returned 1 ++ */ ++#define FUSE_OPT_KEY_KEEP -3 ++ ++/** ++ * Special key value for options to discard ++ * ++ * Argument is not passed to processing function, but behave as if the ++ * processing function returned zero ++ */ ++#define FUSE_OPT_KEY_DISCARD -4 ++ ++/** ++ * Processing function ++ * ++ * This function is called if ++ * - option did 
not match any 'struct fuse_opt' ++ * - argument is a non-option ++ * - option did match and offset was set to -1 ++ * ++ * The 'arg' parameter will always contain the whole argument or ++ * option including the parameter if exists. A two-argument option ++ * ("-x foo") is always converted to single argument option of the ++ * form "-xfoo" before this function is called. ++ * ++ * Options of the form '-ofoo' are passed to this function without the ++ * '-o' prefix. ++ * ++ * The return value of this function determines whether this argument ++ * is to be inserted into the output argument vector, or discarded. ++ * ++ * @param data is the user data passed to the fuse_opt_parse() function ++ * @param arg is the whole argument or option ++ * @param key determines why the processing function was called ++ * @param outargs the current output argument list ++ * @return -1 on error, 0 if arg is to be discarded, 1 if arg should be kept ++ */ ++typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key, ++ struct fuse_args *outargs); ++ ++/** ++ * Option parsing function ++ * ++ * If 'args' was returned from a previous call to fuse_opt_parse() or ++ * it was constructed from ++ * ++ * A NULL 'args' is equivalent to an empty argument vector ++ * ++ * A NULL 'opts' is equivalent to an 'opts' array containing a single ++ * end marker ++ * ++ * A NULL 'proc' is equivalent to a processing function always ++ * returning '1' ++ * ++ * @param args is the input and output argument list ++ * @param data is the user data ++ * @param opts is the option description array ++ * @param proc is the processing function ++ * @return -1 on error, 0 on success ++ */ ++int fuse_opt_parse(struct fuse_args *args, void *data, ++ const struct fuse_opt opts[], fuse_opt_proc_t proc); ++ ++/** ++ * Add an option to a comma separated option list ++ * ++ * @param opts is a pointer to an option list, may point to a NULL value ++ * @param opt is the option to add ++ * @return -1 on allocation 
error, 0 on success ++ */ ++int fuse_opt_add_opt(char **opts, const char *opt); ++ ++/** ++ * Add an option, escaping commas, to a comma separated option list ++ * ++ * @param opts is a pointer to an option list, may point to a NULL value ++ * @param opt is the option to add ++ * @return -1 on allocation error, 0 on success ++ */ ++int fuse_opt_add_opt_escaped(char **opts, const char *opt); ++ ++/** ++ * Add an argument to a NULL terminated argument vector ++ * ++ * @param args is the structure containing the current argument list ++ * @param arg is the new argument to add ++ * @return -1 on allocation error, 0 on success ++ */ ++int fuse_opt_add_arg(struct fuse_args *args, const char *arg); ++ ++/** ++ * Add an argument at the specified position in a NULL terminated ++ * argument vector ++ * ++ * Adds the argument to the N-th position. This is useful for adding ++ * options at the beginning of the array which must not come after the ++ * special '--' option. ++ * ++ * @param args is the structure containing the current argument list ++ * @param pos is the position at which to add the argument ++ * @param arg is the new argument to add ++ * @return -1 on allocation error, 0 on success ++ */ ++int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg); ++ ++/** ++ * Free the contents of argument list ++ * ++ * The structure itself is not freed ++ * ++ * @param args is the structure containing the argument list ++ */ ++void fuse_opt_free_args(struct fuse_args *args); ++ ++ ++/** ++ * Check if an option matches ++ * ++ * @param opts is the option description array ++ * @param opt is the option to match ++ * @return 1 if a match is found, 0 if not ++ */ ++int fuse_opt_match(const struct fuse_opt opts[], const char *opt); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* FUSE_OPT_H_ */ +diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h +new file mode 100644 +index 0000000..6b77c33 +--- /dev/null ++++ 
b/tools/virtiofsd/passthrough_helpers.h +@@ -0,0 +1,76 @@ ++/* ++ * FUSE: Filesystem in Userspace ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE ++ */ ++ ++/* ++ * Creates files on the underlying file system in response to a FUSE_MKNOD ++ * operation ++ */ ++static int mknod_wrapper(int dirfd, const char *path, const char *link, ++ int mode, dev_t rdev) ++{ ++ int res; ++ ++ if (S_ISREG(mode)) { ++ res = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode); ++ if (res >= 0) ++ res = close(res); ++ } else if (S_ISDIR(mode)) { ++ res = mkdirat(dirfd, path, mode); ++ } else if (S_ISLNK(mode) && link != NULL) { ++ res = symlinkat(link, dirfd, path); ++ } else if (S_ISFIFO(mode)) { ++ res = mkfifoat(dirfd, path, mode); 
++#ifdef __FreeBSD__ ++ } else if (S_ISSOCK(mode)) { ++ struct sockaddr_un su; ++ int fd; ++ ++ if (strlen(path) >= sizeof(su.sun_path)) { ++ errno = ENAMETOOLONG; ++ return -1; ++ } ++ fd = socket(AF_UNIX, SOCK_STREAM, 0); ++ if (fd >= 0) { ++ /* ++ * We must bind the socket to the underlying file ++ * system to create the socket file, even though ++ * we'll never listen on this socket. ++ */ ++ su.sun_family = AF_UNIX; ++ strncpy(su.sun_path, path, sizeof(su.sun_path)); ++ res = bindat(dirfd, fd, (struct sockaddr*)&su, ++ sizeof(su)); ++ if (res == 0) ++ close(fd); ++ } else { ++ res = -1; ++ } ++#endif ++ } else { ++ res = mknodat(dirfd, path, mode, rdev); ++ } ++ ++ return res; ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Remove-fuse_req_getgroups.patch b/kvm-virtiofsd-Remove-fuse_req_getgroups.patch new file mode 100755 index 0000000..27e71f2 --- /dev/null +++ b/kvm-virtiofsd-Remove-fuse_req_getgroups.patch @@ -0,0 +1,193 @@ +From 7a1860c83ff042f3e796c449e780ee0528107213 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:08 +0000 +Subject: [PATCH 12/18] virtiofsd: Remove fuse_req_getgroups +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-2-dgilbert@redhat.com> +Patchwork-id: 94122 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/7] virtiofsd: Remove fuse_req_getgroups +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: "Dr. David Alan Gilbert" + +Remove fuse_req_getgroups that's unused in virtiofsd; it came in +from libfuse but we don't actually use it. It was called from +fuse_getgroups which we previously removed (but had left it's header +in). + +Coverity had complained about null termination in it, but removing +it is the easiest answer. + +Fixes: Coverity CID: 1413117 (String not null terminated) +Signed-off-by: Dr. 
David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 988717b46b6424907618cb845ace9d69062703af) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse.h | 20 ----------- + tools/virtiofsd/fuse_lowlevel.c | 77 ----------------------------------------- + tools/virtiofsd/fuse_lowlevel.h | 21 ----------- + 3 files changed, 118 deletions(-) + +diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h +index 7a4c713..aba13fe 100644 +--- a/tools/virtiofsd/fuse.h ++++ b/tools/virtiofsd/fuse.h +@@ -1007,26 +1007,6 @@ void fuse_exit(struct fuse *f); + struct fuse_context *fuse_get_context(void); + + /** +- * Get the current supplementary group IDs for the current request +- * +- * Similar to the getgroups(2) system call, except the return value is +- * always the total number of group IDs, even if it is larger than the +- * specified size. +- * +- * The current fuse kernel module in linux (as of 2.6.30) doesn't pass +- * the group list to userspace, hence this function needs to parse +- * "/proc/$TID/task/$TID/status" to get the group IDs. +- * +- * This feature may not be supported on all operating systems. In +- * such a case this function will return -ENOSYS. 
+- * +- * @param size size of given array +- * @param list array of group IDs to be filled in +- * @return the total number of supplementary group IDs or -errno on failure +- */ +-int fuse_getgroups(int size, gid_t list[]); +- +-/** + * Check if the current request has already been interrupted + * + * @return 1 if the request has been interrupted, 0 otherwise +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index de2e2e0..01c418a 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2667,83 +2667,6 @@ int fuse_lowlevel_is_virtio(struct fuse_session *se) + return !!se->virtio_dev; + } + +-#ifdef linux +-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) +-{ +- char *buf; +- size_t bufsize = 1024; +- char path[128]; +- int ret; +- int fd; +- unsigned long pid = req->ctx.pid; +- char *s; +- +- sprintf(path, "/proc/%lu/task/%lu/status", pid, pid); +- +-retry: +- buf = malloc(bufsize); +- if (buf == NULL) { +- return -ENOMEM; +- } +- +- ret = -EIO; +- fd = open(path, O_RDONLY); +- if (fd == -1) { +- goto out_free; +- } +- +- ret = read(fd, buf, bufsize); +- close(fd); +- if (ret < 0) { +- ret = -EIO; +- goto out_free; +- } +- +- if ((size_t)ret == bufsize) { +- free(buf); +- bufsize *= 4; +- goto retry; +- } +- +- ret = -EIO; +- s = strstr(buf, "\nGroups:"); +- if (s == NULL) { +- goto out_free; +- } +- +- s += 8; +- ret = 0; +- while (1) { +- char *end; +- unsigned long val = strtoul(s, &end, 0); +- if (end == s) { +- break; +- } +- +- s = end; +- if (ret < size) { +- list[ret] = val; +- } +- ret++; +- } +- +-out_free: +- free(buf); +- return ret; +-} +-#else /* linux */ +-/* +- * This is currently not implemented on other than Linux... 
+- */ +-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) +-{ +- (void)req; +- (void)size; +- (void)list; +- return -ENOSYS; +-} +-#endif +- + void fuse_session_exit(struct fuse_session *se) + { + se->exited = 1; +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 138041e..8f6d705 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1705,27 +1705,6 @@ void *fuse_req_userdata(fuse_req_t req); + const struct fuse_ctx *fuse_req_ctx(fuse_req_t req); + + /** +- * Get the current supplementary group IDs for the specified request +- * +- * Similar to the getgroups(2) system call, except the return value is +- * always the total number of group IDs, even if it is larger than the +- * specified size. +- * +- * The current fuse kernel module in linux (as of 2.6.30) doesn't pass +- * the group list to userspace, hence this function needs to parse +- * "/proc/$TID/task/$TID/status" to get the group IDs. +- * +- * This feature may not be supported on all operating systems. In +- * such a case this function will return -ENOSYS. +- * +- * @param req request handle +- * @param size size of given array +- * @param list array of group IDs to be filled in +- * @return the total number of supplementary group IDs or -errno on failure +- */ +-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]); +- +-/** + * Callback function for an interrupt + * + * @param req interrupted request +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Remove-unused-enum-fuse_buf_copy_flags.patch b/kvm-virtiofsd-Remove-unused-enum-fuse_buf_copy_flags.patch new file mode 100755 index 0000000..7f9c5bb --- /dev/null +++ b/kvm-virtiofsd-Remove-unused-enum-fuse_buf_copy_flags.patch @@ -0,0 +1,271 @@ +From 80237df2b22eca685037456e65d149fed4654165 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:48 +0100 +Subject: [PATCH 017/116] virtiofsd: Remove unused enum fuse_buf_copy_flags +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-14-dgilbert@redhat.com> +Patchwork-id: 93465 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 013/112] virtiofsd: Remove unused enum fuse_buf_copy_flags +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Xiao Yang + +Signed-off-by: Xiao Yang +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 8c3fe75e0308ba2f01d160ace534b7e386cea808) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 7 +++--- + tools/virtiofsd/fuse_common.h | 46 +--------------------------------------- + tools/virtiofsd/fuse_lowlevel.c | 13 +++++------- + tools/virtiofsd/fuse_lowlevel.h | 35 ++---------------------------- + tools/virtiofsd/passthrough_ll.c | 4 ++-- + 5 files changed, 13 insertions(+), 92 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 5df946c..4d507f3 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -171,7 +171,7 @@ static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, + + static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, +- size_t len, enum fuse_buf_copy_flags flags) ++ size_t len) + { + int src_is_fd = src->flags & FUSE_BUF_IS_FD; + int dst_is_fd = dst->flags & FUSE_BUF_IS_FD; +@@ -224,8 +224,7 @@ static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) + return 1; + } + +-ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv, +- enum fuse_buf_copy_flags flags) ++ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv) + { + size_t copied = 0; + +@@ 
-249,7 +248,7 @@ ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv, + dst_len = dst->size - dstv->off; + len = min_size(src_len, dst_len); + +- res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len, flags); ++ res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len); + if (res < 0) { + if (!copied) { + return res; +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index bd9bf86..0cb33ac 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -605,48 +605,6 @@ enum fuse_buf_flags { + }; + + /** +- * Buffer copy flags +- */ +-enum fuse_buf_copy_flags { +- /** +- * Don't use splice(2) +- * +- * Always fall back to using read and write instead of +- * splice(2) to copy data from one file descriptor to another. +- * +- * If this flag is not set, then only fall back if splice is +- * unavailable. +- */ +- FUSE_BUF_NO_SPLICE = (1 << 1), +- +- /** +- * Force splice +- * +- * Always use splice(2) to copy data from one file descriptor +- * to another. If splice is not available, return -EINVAL. +- */ +- FUSE_BUF_FORCE_SPLICE = (1 << 2), +- +- /** +- * Try to move data with splice. +- * +- * If splice is used, try to move pages from the source to the +- * destination instead of copying. See documentation of +- * SPLICE_F_MOVE in splice(2) man page. +- */ +- FUSE_BUF_SPLICE_MOVE = (1 << 3), +- +- /** +- * Don't block on the pipe when copying data with splice +- * +- * Makes the operations on the pipe non-blocking (if the pipe +- * is full or empty). See SPLICE_F_NONBLOCK in the splice(2) +- * man page. +- */ +- FUSE_BUF_SPLICE_NONBLOCK = (1 << 4), +-}; +- +-/** + * Single data buffer + * + * Generic data buffer for I/O, extended attributes, etc... 
Data may +@@ -741,11 +699,9 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv); + * + * @param dst destination buffer vector + * @param src source buffer vector +- * @param flags flags controlling the copy + * @return actual number of bytes copied or -errno on error + */ +-ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src, +- enum fuse_buf_copy_flags flags); ++ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src); + + /* + * Signal handling +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index eb0ec49..3da80de 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -490,16 +490,14 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se, + + static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int iov_count, +- struct fuse_bufvec *buf, unsigned int flags) ++ struct fuse_bufvec *buf) + { + size_t len = fuse_buf_size(buf); +- (void)flags; + + return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); + } + +-int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags) ++int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv) + { + struct iovec iov[2]; + struct fuse_out_header out; +@@ -511,7 +509,7 @@ int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, + out.unique = req->unique; + out.error = 0; + +- res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv, flags); ++ res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv); + if (res <= 0) { + fuse_free_req(req); + return res; +@@ -1969,8 +1967,7 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + } + + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags) ++ off_t offset, struct fuse_bufvec *bufv) + { + struct fuse_out_header out; + struct fuse_notify_store_out outarg; 
+@@ -1999,7 +1996,7 @@ int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + +- res = fuse_send_data_iov(se, NULL, iov, 2, bufv, flags); ++ res = fuse_send_data_iov(se, NULL, iov, 2, bufv); + if (res > 0) { + res = -res; + } +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 12a84b4..2fa225d 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1363,33 +1363,6 @@ int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); + /** + * Reply with data copied/moved from buffer(s) + * +- * Zero copy data transfer ("splicing") will be used under +- * the following circumstances: +- * +- * 1. FUSE_CAP_SPLICE_WRITE is set in fuse_conn_info.want, and +- * 2. the kernel supports splicing from the fuse device +- * (FUSE_CAP_SPLICE_WRITE is set in fuse_conn_info.capable), and +- * 3. *flags* does not contain FUSE_BUF_NO_SPLICE +- * 4. The amount of data that is provided in file-descriptor backed +- * buffers (i.e., buffers for which bufv[n].flags == FUSE_BUF_FD) +- * is at least twice the page size. +- * +- * In order for SPLICE_F_MOVE to be used, the following additional +- * conditions have to be fulfilled: +- * +- * 1. FUSE_CAP_SPLICE_MOVE is set in fuse_conn_info.want, and +- * 2. the kernel supports it (i.e, FUSE_CAP_SPLICE_MOVE is set in +- fuse_conn_info.capable), and +- * 3. *flags* contains FUSE_BUF_SPLICE_MOVE +- * +- * Note that, if splice is used, the data is actually spliced twice: +- * once into a temporary pipe (to prepend header data), and then again +- * into the kernel. If some of the provided buffers are memory-backed, +- * the data in them is copied in step one and spliced in step two. +- * +- * The FUSE_BUF_SPLICE_FORCE_SPLICE and FUSE_BUF_SPLICE_NONBLOCK flags +- * are silently ignored. 
+- * + * Possible requests: + * read, readdir, getxattr, listxattr + * +@@ -1400,11 +1373,9 @@ int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); + * + * @param req request handle + * @param bufv buffer vector +- * @param flags flags controlling the copy + * @return zero for success, -errno for failure to send reply + */ +-int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags); ++int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv); + + /** + * Reply with data vector +@@ -1705,12 +1676,10 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + * @param ino the inode number + * @param offset the starting offset into the file to store to + * @param bufv buffer vector +- * @param flags flags controlling the copy + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags); ++ off_t offset, struct fuse_bufvec *bufv); + + /* + * Utility functions +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9377718..126a56c 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -931,7 +931,7 @@ static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, + buf.buf[0].fd = fi->fh; + buf.buf[0].pos = offset; + +- fuse_reply_data(req, &buf, FUSE_BUF_SPLICE_MOVE); ++ fuse_reply_data(req, &buf); + } + + static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, +@@ -952,7 +952,7 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + out_buf.buf[0].size, (unsigned long)off); + } + +- res = fuse_buf_copy(&out_buf, in_buf, 0); ++ res = fuse_buf_copy(&out_buf, in_buf); + if (res < 0) { + fuse_reply_err(req, -res); + } else { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Reset-O_DIRECT-flag-during-file-open.patch 
b/kvm-virtiofsd-Reset-O_DIRECT-flag-during-file-open.patch new file mode 100755 index 0000000..e1a3cd1 --- /dev/null +++ b/kvm-virtiofsd-Reset-O_DIRECT-flag-during-file-open.patch @@ -0,0 +1,72 @@ +From b8d62021f28114f054571b96ec0cd4dad4476923 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:14 +0100 +Subject: [PATCH 103/116] virtiofsd: Reset O_DIRECT flag during file open +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-100-dgilbert@redhat.com> +Patchwork-id: 93553 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 099/112] virtiofsd: Reset O_DIRECT flag during file open +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +If an application wants to do direct IO and opens a file with O_DIRECT +in guest, that does not necessarily mean that we need to bypass page +cache on host as well. So reset this flag on host. + +If somebody needs to bypass page cache on host as well (and it is safe to +do so), we can add a knob in daemon later to control this behavior. + +I check virtio-9p and they do reset O_DIRECT flag. + +Signed-off-by: Vivek Goyal +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 65da4539803373ec4eec97ffc49ee90083e56efd) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index ccbbec1..948cb19 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1721,6 +1721,13 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out; + } + ++ /* ++ * O_DIRECT in guest should not necessarily mean bypassing page ++ * cache on host as well. 
If somebody needs that behavior, it ++ * probably should be a configuration knob in daemon. ++ */ ++ fi->flags &= ~O_DIRECT; ++ + fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, + mode); + err = fd == -1 ? errno : 0; +@@ -1950,6 +1957,13 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + fi->flags &= ~O_APPEND; + } + ++ /* ++ * O_DIRECT in guest should not necessarily mean bypassing page ++ * cache on host as well. If somebody needs that behavior, it ++ * probably should be a configuration knob in daemon. ++ */ ++ fi->flags &= ~O_DIRECT; ++ + sprintf(buf, "%i", lo_fd(req, ino)); + fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); + if (fd == -1) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Send-replies-to-messages.patch b/kvm-virtiofsd-Send-replies-to-messages.patch new file mode 100755 index 0000000..5453fda --- /dev/null +++ b/kvm-virtiofsd-Send-replies-to-messages.patch @@ -0,0 +1,199 @@ +From bb1f691dc410ce11ac9675ced70e78a3ce2511b0 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:03 +0100 +Subject: [PATCH 032/116] virtiofsd: Send replies to messages +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-29-dgilbert@redhat.com> +Patchwork-id: 93485 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 028/112] virtiofsd: Send replies to messages +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Route fuse out messages back through the same queue elements +that had the command that triggered the request. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit df57ba919ec3edef9cc208d35685095e6e92713e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 4 ++ + tools/virtiofsd/fuse_virtio.c | 107 ++++++++++++++++++++++++++++++++++++++-- + tools/virtiofsd/fuse_virtio.h | 4 ++ + 3 files changed, 111 insertions(+), 4 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index af09fa2..380d93b 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -171,6 +171,10 @@ static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + } + } + ++ if (fuse_lowlevel_is_virtio(se)) { ++ return virtio_send_msg(se, ch, iov, count); ++ } ++ + abort(); /* virtio should have taken it before here */ + return 0; + } +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 3841b20..05d0e29 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -41,6 +41,9 @@ struct fv_QueueInfo { + /* Our queue index, corresponds to array position */ + int qidx; + int kick_fd; ++ ++ /* The element for the command currently being processed */ ++ VuVirtqElement *qe; + }; + + /* +@@ -121,6 +124,105 @@ static void copy_from_iov(struct fuse_buf *buf, size_t out_num, + } + } + ++/* ++ * Copy from one iov to another, the given number of bytes ++ * The caller must have checked sizes. ++ */ ++static void copy_iov(struct iovec *src_iov, int src_count, ++ struct iovec *dst_iov, int dst_count, size_t to_copy) ++{ ++ size_t dst_offset = 0; ++ /* Outer loop copies 'src' elements */ ++ while (to_copy) { ++ assert(src_count); ++ size_t src_len = src_iov[0].iov_len; ++ size_t src_offset = 0; ++ ++ if (src_len > to_copy) { ++ src_len = to_copy; ++ } ++ /* Inner loop copies contents of one 'src' to maybe multiple dst. 
*/ ++ while (src_len) { ++ assert(dst_count); ++ size_t dst_len = dst_iov[0].iov_len - dst_offset; ++ if (dst_len > src_len) { ++ dst_len = src_len; ++ } ++ ++ memcpy(dst_iov[0].iov_base + dst_offset, ++ src_iov[0].iov_base + src_offset, dst_len); ++ src_len -= dst_len; ++ to_copy -= dst_len; ++ src_offset += dst_len; ++ dst_offset += dst_len; ++ ++ assert(dst_offset <= dst_iov[0].iov_len); ++ if (dst_offset == dst_iov[0].iov_len) { ++ dst_offset = 0; ++ dst_iov++; ++ dst_count--; ++ } ++ } ++ src_iov++; ++ src_count--; ++ } ++} ++ ++/* ++ * Called back by ll whenever it wants to send a reply/message back ++ * The 1st element of the iov starts with the fuse_out_header ++ * 'unique'==0 means it's a notify message. ++ */ ++int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count) ++{ ++ VuVirtqElement *elem; ++ VuVirtq *q; ++ ++ assert(count >= 1); ++ assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); ++ ++ struct fuse_out_header *out = iov[0].iov_base; ++ /* TODO: Endianness! */ ++ ++ size_t tosend_len = iov_size(iov, count); ++ ++ /* unique == 0 is notification, which we don't support */ ++ assert(out->unique); ++ /* For virtio we always have ch */ ++ assert(ch); ++ elem = ch->qi->qe; ++ q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; ++ ++ /* The 'in' part of the elem is to qemu */ ++ unsigned int in_num = elem->in_num; ++ struct iovec *in_sg = elem->in_sg; ++ size_t in_len = iov_size(in_sg, in_num); ++ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n", ++ __func__, elem->index, in_num, in_len); ++ ++ /* ++ * The elem should have room for a 'fuse_out_header' (out from fuse) ++ * plus the data based on the len in the header. 
++ */ ++ if (in_len < sizeof(struct fuse_out_header)) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", ++ __func__, elem->index); ++ return -E2BIG; ++ } ++ if (in_len < tosend_len) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", ++ __func__, elem->index, tosend_len); ++ return -E2BIG; ++ } ++ ++ copy_iov(iov, count, in_sg, in_num, tosend_len); ++ vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); ++ vu_queue_notify(&se->virtio_dev->dev, q); ++ ++ return 0; ++} ++ + /* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { +@@ -226,13 +328,10 @@ static void *fv_queue_thread(void *opaque) + + /* TODO! Endianness of header */ + +- /* TODO: Fixup fuse_send_msg */ + /* TODO: Add checks for fuse_session_exited */ + fuse_session_process_buf_int(se, &fbuf, &ch); + +- /* TODO: vu_queue_push(dev, q, elem, qi->write_count); */ +- vu_queue_notify(dev, q); +- ++ qi->qe = NULL; + free(elem); + elem = NULL; + } +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +index 23026d6..135a148 100644 +--- a/tools/virtiofsd/fuse_virtio.h ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -22,4 +22,8 @@ int virtio_session_mount(struct fuse_session *se); + + int virtio_loop(struct fuse_session *se); + ++ ++int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count); ++ + #endif +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Show-submounts.patch b/kvm-virtiofsd-Show-submounts.patch new file mode 100755 index 0000000..d45a030 --- /dev/null +++ b/kvm-virtiofsd-Show-submounts.patch @@ -0,0 +1,51 @@ +From 717373379510cd6ecf8c6d0e1aae65edfac4551d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:58 +0100 +Subject: [PATCH 7/9] virtiofsd: Show submounts + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200505163600.22956-6-dgilbert@redhat.com> +Patchwork-id: 96273 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 5/7] virtiofsd: Show submounts +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. Tsirkin + +From: Max Reitz + +Currently, setup_mounts() bind-mounts the shared directory without +MS_REC. This makes all submounts disappear. + +Pass MS_REC so that the guest can see submounts again. + +Fixes: 5baa3b8e95064c2434bd9e2f312edd5e9ae275dc +Signed-off-by: Max Reitz +Message-Id: <20200424133516.73077-1-mreitz@redhat.com> +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert + Changed Fixes to point to the commit with the problem rather than + the commit that turned it on +(cherry picked from commit ace0829c0d08f0e5f1451e402e94495bc2166772) + +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 73d8405..614ba55 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2670,7 +2670,7 @@ static void setup_mounts(const char *source) + int oldroot; + int newroot; + +- if (mount(source, source, NULL, MS_BIND, NULL) < 0) { ++ if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); + exit(1); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Start-queue-threads.patch b/kvm-virtiofsd-Start-queue-threads.patch new file mode 100755 index 0000000..8b03cd6 --- /dev/null +++ b/kvm-virtiofsd-Start-queue-threads.patch @@ -0,0 +1,165 @@ +From 38282d996cde61261211160577b366b83cad8012 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:00 +0100 +Subject: [PATCH 029/116] virtiofsd: Start queue threads +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-26-dgilbert@redhat.com> +Patchwork-id: 93479 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 025/112] virtiofsd: Start queue threads +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Start a thread for each queue when we get notified it's been started. + +Signed-off-by: Dr. David Alan Gilbert +fix by: +Signed-off-by: Jun Piao +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e4c55a3c144493b436e40031e2eed61a84eca47b) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 89 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 89 insertions(+) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 4819e56..2a94bb3 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -11,6 +11,7 @@ + * See the file COPYING.LIB + */ + ++#include "qemu/osdep.h" + #include "fuse_virtio.h" + #include "fuse_i.h" + #include "standard-headers/linux/fuse.h" +@@ -30,6 +31,15 @@ + + #include "contrib/libvhost-user/libvhost-user.h" + ++struct fv_QueueInfo { ++ pthread_t thread; ++ struct fv_VuDev *virtio_dev; ++ ++ /* Our queue index, corresponds to array position */ ++ int qidx; ++ int kick_fd; ++}; ++ + /* + * We pass the dev element into libvhost-user + * and then use it to get back to the outer +@@ -38,6 +48,13 @@ + struct fv_VuDev { + VuDev dev; + struct fuse_session *se; ++ ++ /* ++ * The following pair of fields are only accessed in the main ++ * virtio_loop ++ */ ++ size_t nqueues; ++ struct fv_QueueInfo **qi; + }; + + /* From spec */ +@@ -83,6 
+100,75 @@ static void fv_panic(VuDev *dev, const char *err) + exit(EXIT_FAILURE); + } + ++static void *fv_queue_thread(void *opaque) ++{ ++ struct fv_QueueInfo *qi = opaque; ++ fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, ++ qi->qidx, qi->kick_fd); ++ while (1) { ++ /* TODO */ ++ } ++ ++ return NULL; ++} ++ ++/* Callback from libvhost-user on start or stop of a queue */ ++static void fv_queue_set_started(VuDev *dev, int qidx, bool started) ++{ ++ struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev); ++ struct fv_QueueInfo *ourqi; ++ ++ fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx, ++ started); ++ assert(qidx >= 0); ++ ++ /* ++ * Ignore additional request queues for now. passthrough_ll.c must be ++ * audited for thread-safety issues first. It was written with a ++ * well-behaved client in mind and may not protect against all types of ++ * races yet. ++ */ ++ if (qidx > 1) { ++ fuse_log(FUSE_LOG_ERR, ++ "%s: multiple request queues not yet implemented, please only " ++ "configure 1 request queue\n", ++ __func__); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (started) { ++ /* Fire up a thread to watch this queue */ ++ if (qidx >= vud->nqueues) { ++ vud->qi = realloc(vud->qi, (qidx + 1) * sizeof(vud->qi[0])); ++ assert(vud->qi); ++ memset(vud->qi + vud->nqueues, 0, ++ sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues))); ++ vud->nqueues = qidx + 1; ++ } ++ if (!vud->qi[qidx]) { ++ vud->qi[qidx] = calloc(sizeof(struct fv_QueueInfo), 1); ++ assert(vud->qi[qidx]); ++ vud->qi[qidx]->virtio_dev = vud; ++ vud->qi[qidx]->qidx = qidx; ++ } else { ++ /* Shouldn't have been started */ ++ assert(vud->qi[qidx]->kick_fd == -1); ++ } ++ ourqi = vud->qi[qidx]; ++ ourqi->kick_fd = dev->vq[qidx].kick_fd; ++ if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) { ++ fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n", ++ __func__, qidx); ++ assert(0); ++ } ++ } else { ++ /* TODO: Kill the thread */ ++ 
assert(qidx < vud->nqueues); ++ ourqi = vud->qi[qidx]; ++ ourqi->kick_fd = -1; ++ } ++} ++ + static bool fv_queue_order(VuDev *dev, int qidx) + { + return false; +@@ -92,6 +178,9 @@ static const VuDevIface fv_iface = { + .get_features = fv_get_features, + .set_features = fv_set_features, + ++ /* Don't need process message, we've not got any at vhost-user level */ ++ .queue_set_started = fv_queue_set_started, ++ + .queue_is_processed_in_order = fv_queue_order, + }; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Start-reading-commands-from-queue.patch b/kvm-virtiofsd-Start-reading-commands-from-queue.patch new file mode 100755 index 0000000..2022480 --- /dev/null +++ b/kvm-virtiofsd-Start-reading-commands-from-queue.patch @@ -0,0 +1,200 @@ +From b4af2eff8ecadb4e2c9520602455f77fac2cb943 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:02 +0100 +Subject: [PATCH 031/116] virtiofsd: Start reading commands from queue +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-28-dgilbert@redhat.com> +Patchwork-id: 93484 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 027/112] virtiofsd: Start reading commands from queue +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Pop queue elements off queues, copy the data from them and +pass that to fuse. + + Note: 'out' in a VuVirtqElement is from QEMU + 'in' in libfuse is into the daemon + + So we read from the out iov's to get a fuse_in_header + +When we get a kick we've got to read all the elements until the queue +is empty. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit b509e1228b3e5eb83c14819045988999fc2dbd1b) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 2 + + tools/virtiofsd/fuse_virtio.c | 99 +++++++++++++++++++++++++++++++++++++++++-- + 2 files changed, 98 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index ec04449..1126723 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -14,6 +14,7 @@ + #include "fuse_lowlevel.h" + + struct fv_VuDev; ++struct fv_QueueInfo; + + struct fuse_req { + struct fuse_session *se; +@@ -75,6 +76,7 @@ struct fuse_chan { + pthread_mutex_t lock; + int ctr; + int fd; ++ struct fv_QueueInfo *qi; + }; + + /** +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 05e7258..3841b20 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -12,6 +12,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/iov.h" + #include "fuse_virtio.h" + #include "fuse_i.h" + #include "standard-headers/linux/fuse.h" +@@ -32,6 +33,7 @@ + + #include "contrib/libvhost-user/libvhost-user.h" + ++struct fv_VuDev; + struct fv_QueueInfo { + pthread_t thread; + struct fv_VuDev *virtio_dev; +@@ -101,10 +103,41 @@ static void fv_panic(VuDev *dev, const char *err) + exit(EXIT_FAILURE); + } + ++/* ++ * Copy from an iovec into a fuse_buf (memory only) ++ * Caller must ensure there is space ++ */ ++static void copy_from_iov(struct fuse_buf *buf, size_t out_num, ++ const struct iovec *out_sg) ++{ ++ void *dest = buf->mem; ++ ++ while (out_num) { ++ size_t onelen = out_sg->iov_len; ++ memcpy(dest, out_sg->iov_base, onelen); ++ dest += onelen; ++ out_sg++; ++ out_num--; ++ } ++} ++ + /* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { + struct fv_QueueInfo *qi = opaque; ++ struct VuDev *dev = &qi->virtio_dev->dev; ++ struct VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ 
struct fuse_session *se = qi->virtio_dev->se; ++ struct fuse_chan ch; ++ struct fuse_buf fbuf; ++ ++ fbuf.mem = NULL; ++ fbuf.flags = 0; ++ ++ fuse_mutex_init(&ch.lock); ++ ch.fd = (int)0xdaff0d111; ++ ch.qi = qi; ++ + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); + while (1) { +@@ -141,11 +174,71 @@ static void *fv_queue_thread(void *opaque) + fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n"); + break; + } +- if (qi->virtio_dev->se->debug) { +- fprintf(stderr, "%s: Queue %d gave evalue: %zx\n", __func__, +- qi->qidx, (size_t)evalue); ++ /* out is from guest, in is too guest */ ++ unsigned int in_bytes, out_bytes; ++ vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0); ++ ++ fuse_log(FUSE_LOG_DEBUG, ++ "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n", ++ __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); ++ ++ while (1) { ++ /* ++ * An element contains one request and the space to send our ++ * response They're spread over multiple descriptors in a ++ * scatter/gather set and we can't trust the guest to keep them ++ * still; so copy in/out. ++ */ ++ VuVirtqElement *elem = vu_queue_pop(dev, q, sizeof(VuVirtqElement)); ++ if (!elem) { ++ break; ++ } ++ ++ if (!fbuf.mem) { ++ fbuf.mem = malloc(se->bufsize); ++ assert(fbuf.mem); ++ assert(se->bufsize > sizeof(struct fuse_in_header)); ++ } ++ /* The 'out' part of the elem is from qemu */ ++ unsigned int out_num = elem->out_num; ++ struct iovec *out_sg = elem->out_sg; ++ size_t out_len = iov_size(out_sg, out_num); ++ fuse_log(FUSE_LOG_DEBUG, ++ "%s: elem %d: with %d out desc of length %zd\n", __func__, ++ elem->index, out_num, out_len); ++ ++ /* ++ * The elem should contain a 'fuse_in_header' (in to fuse) ++ * plus the data based on the len in the header. 
++ */ ++ if (out_len < sizeof(struct fuse_in_header)) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n", ++ __func__, elem->index); ++ assert(0); /* TODO */ ++ } ++ if (out_len > se->bufsize) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", ++ __func__, elem->index); ++ assert(0); /* TODO */ ++ } ++ copy_from_iov(&fbuf, out_num, out_sg); ++ fbuf.size = out_len; ++ ++ /* TODO! Endianness of header */ ++ ++ /* TODO: Fixup fuse_send_msg */ ++ /* TODO: Add checks for fuse_session_exited */ ++ fuse_session_process_buf_int(se, &fbuf, &ch); ++ ++ /* TODO: vu_queue_push(dev, q, elem, qi->write_count); */ ++ vu_queue_notify(dev, q); ++ ++ free(elem); ++ elem = NULL; + } + } ++ pthread_mutex_destroy(&ch.lock); ++ free(fbuf.mem); + + return NULL; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Start-wiring-up-vhost-user.patch b/kvm-virtiofsd-Start-wiring-up-vhost-user.patch new file mode 100755 index 0000000..7b50118 --- /dev/null +++ b/kvm-virtiofsd-Start-wiring-up-vhost-user.patch @@ -0,0 +1,247 @@ +From 020f593031b0b54e4c35faffea489b700aed6a72 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:57 +0100 +Subject: [PATCH 026/116] virtiofsd: Start wiring up vhost-user +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-23-dgilbert@redhat.com> +Patchwork-id: 93477 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 022/112] virtiofsd: Start wiring up vhost-user +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Listen on our unix socket for the connection from QEMU, when we get it +initialise vhost-user and dive into our own loop variant (currently +dummy). + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit f6f3573c6f271af5ded63ce28589a113f7205c72) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 4 ++ + tools/virtiofsd/fuse_lowlevel.c | 5 +++ + tools/virtiofsd/fuse_lowlevel.h | 7 ++++ + tools/virtiofsd/fuse_virtio.c | 87 +++++++++++++++++++++++++++++++++++++++- + tools/virtiofsd/fuse_virtio.h | 2 + + tools/virtiofsd/passthrough_ll.c | 7 +--- + 6 files changed, 106 insertions(+), 6 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 82d6ac7..ec04449 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -13,6 +13,8 @@ + #include "fuse.h" + #include "fuse_lowlevel.h" + ++struct fv_VuDev; ++ + struct fuse_req { + struct fuse_session *se; + uint64_t unique; +@@ -65,6 +67,8 @@ struct fuse_session { + size_t bufsize; + int error; + char *vu_socket_path; ++ int vu_socketfd; ++ struct fv_VuDev *virtio_dev; + }; + + struct fuse_chan { +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 5df124e..af09fa2 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2242,6 +2242,11 @@ void fuse_session_unmount(struct fuse_session *se) + { + } + ++int fuse_lowlevel_is_virtio(struct fuse_session *se) ++{ ++ return se->vu_socket_path != NULL; ++} ++ + #ifdef linux + int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) + { +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 2fa225d..f6b3470 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1755,6 +1755,13 @@ void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, + */ + int fuse_req_interrupted(fuse_req_t req); + ++/** ++ * Check if the session is connected via virtio ++ * ++ * @param se session object ++ * @return 1 if the session is a virtio session ++ */ ++int fuse_lowlevel_is_virtio(struct fuse_session *se); + + /* + * Inquiry functions +diff --git 
a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index cbef6ff..2ae3c76 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -19,18 +19,78 @@ + + #include + #include ++#include + #include + #include + #include + #include + #include + ++#include "contrib/libvhost-user/libvhost-user.h" ++ ++/* ++ * We pass the dev element into libvhost-user ++ * and then use it to get back to the outer ++ * container for other data. ++ */ ++struct fv_VuDev { ++ VuDev dev; ++ struct fuse_session *se; ++}; ++ + /* From spec */ + struct virtio_fs_config { + char tag[36]; + uint32_t num_queues; + }; + ++/* ++ * Callback from libvhost-user if there's a new fd we're supposed to listen ++ * to, typically a queue kick? ++ */ ++static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb, ++ void *data) ++{ ++ fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd); ++} ++ ++/* ++ * Callback from libvhost-user if we're no longer supposed to listen on an fd ++ */ ++static void fv_remove_watch(VuDev *dev, int fd) ++{ ++ fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd); ++} ++ ++/* Callback from libvhost-user to panic */ ++static void fv_panic(VuDev *dev, const char *err) ++{ ++ fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err); ++ /* TODO: Allow reconnects?? 
*/ ++ exit(EXIT_FAILURE); ++} ++ ++static bool fv_queue_order(VuDev *dev, int qidx) ++{ ++ return false; ++} ++ ++static const VuDevIface fv_iface = { ++ /* TODO: Add other callbacks */ ++ .queue_is_processed_in_order = fv_queue_order, ++}; ++ ++int virtio_loop(struct fuse_session *se) ++{ ++ fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__); ++ ++ while (1) { ++ /* TODO: Add stuffing */ ++ } ++ ++ fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__); ++} ++ + int virtio_session_mount(struct fuse_session *se) + { + struct sockaddr_un un; +@@ -75,5 +135,30 @@ int virtio_session_mount(struct fuse_session *se) + return -1; + } + +- return -1; ++ fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n", ++ __func__); ++ int data_sock = accept(listen_sock, NULL, NULL); ++ if (data_sock == -1) { ++ fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n"); ++ close(listen_sock); ++ return -1; ++ } ++ close(listen_sock); ++ fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n", ++ __func__); ++ ++ /* TODO: Some cleanup/deallocation! 
*/ ++ se->virtio_dev = calloc(sizeof(struct fv_VuDev), 1); ++ if (!se->virtio_dev) { ++ fuse_log(FUSE_LOG_ERR, "%s: virtio_dev calloc failed\n", __func__); ++ close(data_sock); ++ return -1; ++ } ++ ++ se->vu_socketfd = data_sock; ++ se->virtio_dev->se = se; ++ vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, fv_set_watch, ++ fv_remove_watch, &fv_iface); ++ ++ return 0; + } +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +index 8f2edb6..23026d6 100644 +--- a/tools/virtiofsd/fuse_virtio.h ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -20,4 +20,6 @@ struct fuse_session; + + int virtio_session_mount(struct fuse_session *se); + ++int virtio_loop(struct fuse_session *se); ++ + #endif +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index fc9b264..037c5d7 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -36,6 +36,7 @@ + */ + + #include "qemu/osdep.h" ++#include "fuse_virtio.h" + #include "fuse_lowlevel.h" + #include + #include +@@ -1395,11 +1396,7 @@ int main(int argc, char *argv[]) + fuse_daemonize(opts.foreground); + + /* Block until ctrl+c or fusermount -u */ +- if (opts.singlethread) { +- ret = fuse_session_loop(se); +- } else { +- ret = fuse_session_loop_mt(se, opts.clone_fd); +- } ++ ret = virtio_loop(se); + + fuse_session_unmount(se); + err_out3: +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Support-remote-posix-locks.patch b/kvm-virtiofsd-Support-remote-posix-locks.patch new file mode 100755 index 0000000..e60364a --- /dev/null +++ b/kvm-virtiofsd-Support-remote-posix-locks.patch @@ -0,0 +1,355 @@ +From 8e46d0862c4c204f92c08ce2ae961921f270efb5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:03 +0100 +Subject: [PATCH 092/116] virtiofsd: Support remote posix locks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-89-dgilbert@redhat.com> +Patchwork-id: 93537 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 088/112] virtiofsd: Support remote posix locks +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +Doing posix locks with-in guest kernel are not sufficient if a file/dir +is being shared by multiple guests. So we need the notion of daemon doing +the locks which are visible to rest of the guests. + +Given posix locks are per process, one can not call posix lock API on host, +otherwise bunch of basic posix locks properties are broken. For example, +If two processes (A and B) in guest open the file and take locks on different +sections of file, if one of the processes closes the fd, it will close +fd on virtiofsd and all posix locks on file will go away. This means if +process A closes the fd, then locks of process B will go away too. + +Similar other problems exist too. + +This patch set tries to emulate posix locks while using open file +description locks provided on Linux. + +Daemon provides two options (-o posix_lock, -o no_posix_lock) to enable +or disable posix locking in daemon. By default it is enabled. + +There are few issues though. + +- GETLK() returns pid of process holding lock. As we are emulating locks + using OFD, and these locks are not per process and don't return pid + of process, so GETLK() in guest does not return process pid. + +- As of now only F_SETLK is supported and not F_SETLKW. We can't block + the thread in virtiofsd for arbitrary long duration as there is only + one thread serving the queue. That means unlock request will not make + it to daemon and F_SETLKW will block infinitely and bring virtio-fs + to a halt. This is a solvable problem though and will require significant + changes in virtiofsd and kernel. Left as a TODO item for now.
+ +Signed-off-by: Vivek Goyal +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 0e81414c54161296212f6bc8a1c70526c4a9755a) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 3 + + tools/virtiofsd/passthrough_ll.c | 189 +++++++++++++++++++++++++++++++++++++++ + 2 files changed, 192 insertions(+) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5672024..33749bf 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -156,6 +156,9 @@ void fuse_cmdline_help(void) + " allowed (default: 10)\n" + " -o norace disable racy fallback\n" + " default: false\n" ++ " -o posix_lock|no_posix_lock\n" ++ " enable/disable remote posix lock\n" ++ " default: posix_lock\n" + " -o readdirplus|no_readdirplus\n" + " enable/disable readirplus\n" + " default: readdirplus except with " +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 05b5f89..9414935 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -67,6 +67,12 @@ + #include "passthrough_helpers.h" + #include "seccomp.h" + ++/* Keep track of inode posix locks for each owner. 
*/ ++struct lo_inode_plock { ++ uint64_t lock_owner; ++ int fd; /* fd for OFD locks */ ++}; ++ + struct lo_map_elem { + union { + struct lo_inode *inode; +@@ -95,6 +101,8 @@ struct lo_inode { + struct lo_key key; + uint64_t refcount; /* protected by lo->mutex */ + fuse_ino_t fuse_ino; ++ pthread_mutex_t plock_mutex; ++ GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ + }; + + struct lo_cred { +@@ -114,6 +122,7 @@ struct lo_data { + int norace; + int writeback; + int flock; ++ int posix_lock; + int xattr; + char *source; + double timeout; +@@ -137,6 +146,8 @@ static const struct fuse_opt lo_opts[] = { + { "source=%s", offsetof(struct lo_data, source), 0 }, + { "flock", offsetof(struct lo_data, flock), 1 }, + { "no_flock", offsetof(struct lo_data, flock), 0 }, ++ { "posix_lock", offsetof(struct lo_data, posix_lock), 1 }, ++ { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 }, + { "xattr", offsetof(struct lo_data, xattr), 1 }, + { "no_xattr", offsetof(struct lo_data, xattr), 0 }, + { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, +@@ -485,6 +496,17 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } ++ ++ if (conn->capable & FUSE_CAP_POSIX_LOCKS) { ++ if (lo->posix_lock) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n"); ++ conn->want |= FUSE_CAP_POSIX_LOCKS; ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n"); ++ conn->want &= ~FUSE_CAP_POSIX_LOCKS; ++ } ++ } ++ + if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) || + lo->readdirplus_clear) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); +@@ -772,6 +794,19 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + return p; + } + ++/* value_destroy_func for posix_locks GHashTable */ ++static void posix_locks_value_destroy(gpointer data) ++{ ++ struct lo_inode_plock *plock = data; ++ ++ 
/* ++ * We had used open() for locks and had only one fd. So ++ * closing this fd should release all OFD locks. ++ */ ++ close(plock->fd); ++ free(plock); ++} ++ + static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct fuse_entry_param *e) + { +@@ -825,6 +860,9 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + newfd = -1; + inode->key.ino = e->attr.st_ino; + inode->key.dev = e->attr.st_dev; ++ pthread_mutex_init(&inode->plock_mutex, NULL); ++ inode->posix_locks = g_hash_table_new_full( ++ g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy); + + pthread_mutex_lock(&lo->mutex); + inode->fuse_ino = lo_add_inode_mapping(req, inode); +@@ -1160,6 +1198,11 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + if (!inode->refcount) { + lo_map_remove(&lo->ino_map, inode->fuse_ino); + g_hash_table_remove(lo->inodes, &inode->key); ++ if (g_hash_table_size(inode->posix_locks)) { ++ fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n"); ++ } ++ g_hash_table_destroy(inode->posix_locks); ++ pthread_mutex_destroy(&inode->plock_mutex); + pthread_mutex_unlock(&lo->mutex); + close(inode->fd); + free(inode); +@@ -1516,6 +1559,136 @@ out: + } + } + ++/* Should be called with inode->plock_mutex held */ ++static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, ++ struct lo_inode *inode, ++ uint64_t lock_owner, ++ pid_t pid, int *err) ++{ ++ struct lo_inode_plock *plock; ++ char procname[64]; ++ int fd; ++ ++ plock = ++ g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner)); ++ ++ if (plock) { ++ return plock; ++ } ++ ++ plock = malloc(sizeof(struct lo_inode_plock)); ++ if (!plock) { ++ *err = ENOMEM; ++ return NULL; ++ } ++ ++ /* Open another instance of file which can be used for ofd locks. */ ++ sprintf(procname, "%i", inode->fd); ++ ++ /* TODO: What if file is not writable? 
*/ ++ fd = openat(lo->proc_self_fd, procname, O_RDWR); ++ if (fd == -1) { ++ *err = errno; ++ free(plock); ++ return NULL; ++ } ++ ++ plock->lock_owner = lock_owner; ++ plock->fd = fd; ++ g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner), ++ plock); ++ return plock; ++} ++ ++static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct flock *lock) ++{ ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode; ++ struct lo_inode_plock *plock; ++ int ret, saverr = 0; ++ ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_getlk(ino=%" PRIu64 ", flags=%d)" ++ " owner=0x%lx, l_type=%d l_start=0x%lx" ++ " l_len=0x%lx\n", ++ ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start, ++ lock->l_len); ++ ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ pthread_mutex_lock(&inode->plock_mutex); ++ plock = ++ lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); ++ if (!plock) { ++ pthread_mutex_unlock(&inode->plock_mutex); ++ fuse_reply_err(req, ret); ++ return; ++ } ++ ++ ret = fcntl(plock->fd, F_OFD_GETLK, lock); ++ if (ret == -1) { ++ saverr = errno; ++ } ++ pthread_mutex_unlock(&inode->plock_mutex); ++ ++ if (saverr) { ++ fuse_reply_err(req, saverr); ++ } else { ++ fuse_reply_lock(req, lock); ++ } ++} ++ ++static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct flock *lock, int sleep) ++{ ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode; ++ struct lo_inode_plock *plock; ++ int ret, saverr = 0; ++ ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_setlk(ino=%" PRIu64 ", flags=%d)" ++ " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d" ++ " l_start=0x%lx l_len=0x%lx\n", ++ ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep, ++ lock->l_whence, lock->l_start, lock->l_len); ++ ++ if (sleep) { ++ fuse_reply_err(req, EOPNOTSUPP); ++ return; ++ } ++ ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, 
EBADF); ++ return; ++ } ++ ++ pthread_mutex_lock(&inode->plock_mutex); ++ plock = ++ lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); ++ ++ if (!plock) { ++ pthread_mutex_unlock(&inode->plock_mutex); ++ fuse_reply_err(req, ret); ++ return; ++ } ++ ++ /* TODO: Is it alright to modify flock? */ ++ lock->l_pid = 0; ++ ret = fcntl(plock->fd, F_OFD_SETLK, lock); ++ if (ret == -1) { ++ saverr = errno; ++ } ++ pthread_mutex_unlock(&inode->plock_mutex); ++ fuse_reply_err(req, saverr); ++} ++ + static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) + { +@@ -1617,6 +1790,19 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { + int res; + (void)ino; ++ struct lo_inode *inode; ++ ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ /* An fd is going away. Cleanup associated posix locks */ ++ pthread_mutex_lock(&inode->plock_mutex); ++ g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner)); ++ pthread_mutex_unlock(&inode->plock_mutex); ++ + res = close(dup(lo_fi_fd(req, fi))); + fuse_reply_err(req, res == -1 ? errno : 0); + } +@@ -2080,6 +2266,8 @@ static struct fuse_lowlevel_ops lo_oper = { + .releasedir = lo_releasedir, + .fsyncdir = lo_fsyncdir, + .create = lo_create, ++ .getlk = lo_getlk, ++ .setlk = lo_setlk, + .open = lo_open, + .release = lo_release, + .flush = lo_flush, +@@ -2434,6 +2622,7 @@ int main(int argc, char *argv[]) + struct lo_data lo = { + .debug = 0, + .writeback = 0, ++ .posix_lock = 1, + .proc_self_fd = -1, + }; + struct lo_map_elem *root_elem; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Trim-down-imported-files.patch b/kvm-virtiofsd-Trim-down-imported-files.patch new file mode 100755 index 0000000..f3f1e85 --- /dev/null +++ b/kvm-virtiofsd-Trim-down-imported-files.patch @@ -0,0 +1,1582 @@ +From 9d3788b1c2fa5cb4f14e292232a05c6a5217802d Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:44 +0100 +Subject: [PATCH 013/116] virtiofsd: Trim down imported files +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-10-dgilbert@redhat.com> +Patchwork-id: 93463 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 009/112] virtiofsd: Trim down imported files +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +There's a lot of the original fuse code we don't need; trim them down. + +Signed-off-by: Dr. David Alan Gilbert +with additional trimming by: +Signed-off-by: Misono Tomohiro +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Xiao Yang +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a3e23f325439a290c504d6bbc48c2e742149ecab) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 71 +--- + tools/virtiofsd/fuse.h | 46 --- + tools/virtiofsd/fuse_common.h | 32 -- + tools/virtiofsd/fuse_i.h | 41 --- + tools/virtiofsd/fuse_log.h | 8 - + tools/virtiofsd/fuse_lowlevel.c | 675 +--------------------------------- + tools/virtiofsd/fuse_lowlevel.h | 28 -- + tools/virtiofsd/fuse_opt.h | 8 - + tools/virtiofsd/helper.c | 143 ------- + tools/virtiofsd/passthrough_helpers.h | 26 -- + tools/virtiofsd/passthrough_ll.c | 1 - + 11 files changed, 8 insertions(+), 1071 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 5ab9b87..aefb7db 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -157,73 +157,6 @@ static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, + return copied; + } + +-#ifdef HAVE_SPLICE +-static ssize_t fuse_buf_splice(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len, enum fuse_buf_copy_flags flags) +-{ +- int splice_flags = 0; +- off_t *srcpos = 
NULL; +- off_t *dstpos = NULL; +- off_t srcpos_val; +- off_t dstpos_val; +- ssize_t res; +- size_t copied = 0; +- +- if (flags & FUSE_BUF_SPLICE_MOVE) +- splice_flags |= SPLICE_F_MOVE; +- if (flags & FUSE_BUF_SPLICE_NONBLOCK) +- splice_flags |= SPLICE_F_NONBLOCK; +- +- if (src->flags & FUSE_BUF_FD_SEEK) { +- srcpos_val = src->pos + src_off; +- srcpos = &srcpos_val; +- } +- if (dst->flags & FUSE_BUF_FD_SEEK) { +- dstpos_val = dst->pos + dst_off; +- dstpos = &dstpos_val; +- } +- +- while (len) { +- res = splice(src->fd, srcpos, dst->fd, dstpos, len, +- splice_flags); +- if (res == -1) { +- if (copied) +- break; +- +- if (errno != EINVAL || (flags & FUSE_BUF_FORCE_SPLICE)) +- return -errno; +- +- /* Maybe splice is not supported for this combination */ +- return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, +- len); +- } +- if (res == 0) +- break; +- +- copied += res; +- if (!(src->flags & FUSE_BUF_FD_RETRY) && +- !(dst->flags & FUSE_BUF_FD_RETRY)) { +- break; +- } +- +- len -= res; +- } +- +- return copied; +-} +-#else +-static ssize_t fuse_buf_splice(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len, enum fuse_buf_copy_flags flags) +-{ +- (void) flags; +- +- return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); +-} +-#endif +- +- + static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len, enum fuse_buf_copy_flags flags) +@@ -247,10 +180,8 @@ static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, + return fuse_buf_write(dst, dst_off, src, src_off, len); + } else if (!dst_is_fd) { + return fuse_buf_read(dst, dst_off, src, src_off, len); +- } else if (flags & FUSE_BUF_NO_SPLICE) { +- return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); + } else { +- return fuse_buf_splice(dst, dst_off, src, src_off, len, flags); ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); + } + } + +diff --git 
a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h +index 883f6e5..3202fba 100644 +--- a/tools/virtiofsd/fuse.h ++++ b/tools/virtiofsd/fuse.h +@@ -25,10 +25,6 @@ + #include + #include + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /* ----------------------------------------------------------- * + * Basic FUSE API * + * ----------------------------------------------------------- */ +@@ -979,44 +975,6 @@ int fuse_loop(struct fuse *f); + void fuse_exit(struct fuse *f); + + /** +- * FUSE event loop with multiple threads +- * +- * Requests from the kernel are processed, and the appropriate +- * operations are called. Request are processed in parallel by +- * distributing them between multiple threads. +- * +- * For a description of the return value and the conditions when the +- * event loop exits, refer to the documentation of +- * fuse_session_loop(). +- * +- * Note: using fuse_loop() instead of fuse_loop_mt() means you are running in +- * single-threaded mode, and that you will not have to worry about reentrancy, +- * though you will have to worry about recursive lookups. In single-threaded +- * mode, FUSE will wait for one callback to return before calling another. +- * +- * Enabling multiple threads, by using fuse_loop_mt(), will cause FUSE to make +- * multiple simultaneous calls into the various callback functions given by your +- * fuse_operations record. +- * +- * If you are using multiple threads, you can enjoy all the parallel execution +- * and interactive response benefits of threads, and you get to enjoy all the +- * benefits of race conditions and locking bugs, too. Ensure that any code used +- * in the callback function of fuse_operations is also thread-safe. 
+- * +- * @param f the FUSE handle +- * @param config loop configuration +- * @return see fuse_session_loop() +- * +- * See also: fuse_loop() +- */ +-#if FUSE_USE_VERSION < 32 +-int fuse_loop_mt_31(struct fuse *f, int clone_fd); +-#define fuse_loop_mt(f, clone_fd) fuse_loop_mt_31(f, clone_fd) +-#else +-int fuse_loop_mt(struct fuse *f, struct fuse_loop_config *config); +-#endif +- +-/** + * Get the current context + * + * The context is only valid for the duration of a filesystem +@@ -1268,8 +1226,4 @@ struct fuse_session *fuse_get_session(struct fuse *f); + */ + int fuse_open_channel(const char *mountpoint, const char *options); + +-#ifdef __cplusplus +-} +-#endif +- + #endif /* FUSE_H_ */ +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index 2d686b2..bf8f8cc 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -28,10 +28,6 @@ + #define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) + #define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION) + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /** + * Information about an open file. + * +@@ -100,30 +96,6 @@ struct fuse_file_info { + uint32_t poll_events; + }; + +-/** +- * Configuration parameters passed to fuse_session_loop_mt() and +- * fuse_loop_mt(). +- */ +-struct fuse_loop_config { +- /** +- * whether to use separate device fds for each thread +- * (may increase performance) +- */ +- int clone_fd; +- +- /** +- * The maximum number of available worker threads before they +- * start to get deleted when they become idle. If not +- * specified, the default is 10. +- * +- * Adjusting this has performance implications; a very small number +- * of threads in the pool will cause a lot of thread creation and +- * deletion overhead and performance may suffer. When set to 0, a new +- * thread will be created to service every operation. 
+- */ +- unsigned int max_idle_threads; +-}; +- + /************************************************************************** + * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' * + **************************************************************************/ +@@ -802,10 +774,6 @@ void fuse_remove_signal_handlers(struct fuse_session *se); + # error only API version 30 or greater is supported + #endif + +-#ifdef __cplusplus +-} +-#endif +- + + /* + * This interface uses 64 bit off_t. +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index d38b630..b39522e 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -9,8 +9,6 @@ + #include "fuse.h" + #include "fuse_lowlevel.h" + +-struct mount_opts; +- + struct fuse_req { + struct fuse_session *se; + uint64_t unique; +@@ -45,7 +43,6 @@ struct fuse_session { + char *mountpoint; + volatile int exited; + int fd; +- struct mount_opts *mo; + int debug; + int deny_others; + struct fuse_lowlevel_ops op; +@@ -58,7 +55,6 @@ struct fuse_session { + struct fuse_req interrupts; + pthread_mutex_t lock; + int got_destroy; +- pthread_key_t pipe_key; + int broken_splice_nonblock; + uint64_t notify_ctr; + struct fuse_notify_req notify_list; +@@ -87,53 +83,16 @@ struct fuse_module { + int ctr; + }; + +-/* ----------------------------------------------------------- * +- * Channel interface (when using -o clone_fd) * +- * ----------------------------------------------------------- */ +- +-/** +- * Obtain counted reference to the channel +- * +- * @param ch the channel +- * @return the channel +- */ +-struct fuse_chan *fuse_chan_get(struct fuse_chan *ch); +- +-/** +- * Drop counted reference to a channel +- * +- * @param ch the channel +- */ +-void fuse_chan_put(struct fuse_chan *ch); +- +-struct mount_opts *parse_mount_opts(struct fuse_args *args); +-void destroy_mount_opts(struct mount_opts *mo); +-void fuse_mount_version(void); +-unsigned get_max_read(struct mount_opts *o); +-void 
fuse_kern_unmount(const char *mountpoint, int fd); +-int fuse_kern_mount(const char *mountpoint, struct mount_opts *mo); +- + int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + int count); + void fuse_free_req(fuse_req_t req); + +-void cuse_lowlevel_init(fuse_req_t req, fuse_ino_t nodeide, const void *inarg); +- +-int fuse_start_thread(pthread_t *thread_id, void *(*func)(void *), void *arg); +- +-int fuse_session_receive_buf_int(struct fuse_session *se, struct fuse_buf *buf, +- struct fuse_chan *ch); + void fuse_session_process_buf_int(struct fuse_session *se, + const struct fuse_buf *buf, struct fuse_chan *ch); + +-struct fuse *fuse_new_31(struct fuse_args *args, const struct fuse_operations *op, +- size_t op_size, void *private_data); +-int fuse_loop_mt_32(struct fuse *f, struct fuse_loop_config *config); +-int fuse_session_loop_mt_32(struct fuse_session *se, struct fuse_loop_config *config); + + #define FUSE_MAX_MAX_PAGES 256 + #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + + /* room needed in buffer to accommodate header */ + #define FUSE_BUFFER_HEADER_SIZE 0x1000 +- +diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h +index 5e112e0..0af700d 100644 +--- a/tools/virtiofsd/fuse_log.h ++++ b/tools/virtiofsd/fuse_log.h +@@ -16,10 +16,6 @@ + + #include + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /** + * Log severity level + * +@@ -75,8 +71,4 @@ void fuse_set_log_func(fuse_log_func_t func); + */ + void fuse_log(enum fuse_log_level level, const char *fmt, ...); + +-#ifdef __cplusplus +-} +-#endif +- + #endif /* FUSE_LOG_H_ */ +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index f2d7038..e6fa247 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -16,7 +16,6 @@ + #include "fuse_kernel.h" + #include "fuse_opt.h" + #include "fuse_misc.h" +-#include "mount_util.h" + + #include + #include +@@ -28,12 +27,6 @@ + #include + #include + +-#ifndef 
F_LINUX_SPECIFIC_BASE +-#define F_LINUX_SPECIFIC_BASE 1024 +-#endif +-#ifndef F_SETPIPE_SZ +-#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +-#endif + + + #define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) +@@ -137,7 +130,6 @@ void fuse_free_req(fuse_req_t req) + req->u.ni.data = NULL; + list_del_req(req); + ctr = --req->ctr; +- fuse_chan_put(req->ch); + req->ch = NULL; + pthread_mutex_unlock(&se->lock); + if (!ctr) +@@ -184,19 +176,7 @@ static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + } + } + +- ssize_t res = writev(ch ? ch->fd : se->fd, +- iov, count); +- int err = errno; +- +- if (res == -1) { +- assert(se != NULL); +- +- /* ENOENT means the operation was interrupted */ +- if (!fuse_session_exited(se) && err != ENOENT) +- perror("fuse: writing device"); +- return -err; +- } +- ++ abort(); /* virtio should have taken it before here */ + return 0; + } + +@@ -480,10 +460,6 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se, + struct fuse_bufvec *buf, + size_t len) + { +- struct fuse_bufvec mem_buf = FUSE_BUFVEC_INIT(len); +- void *mbuf; +- int res; +- + /* Optimize common case */ + if (buf->count == 1 && buf->idx == 0 && buf->off == 0 && + !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { +@@ -496,350 +472,10 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se, + return fuse_send_msg(se, ch, iov, iov_count); + } + +- res = posix_memalign(&mbuf, pagesize, len); +- if (res != 0) +- return res; +- +- mem_buf.buf[0].mem = mbuf; +- res = fuse_buf_copy(&mem_buf, buf, 0); +- if (res < 0) { +- free(mbuf); +- return -res; +- } +- len = res; +- +- iov[iov_count].iov_base = mbuf; +- iov[iov_count].iov_len = len; +- iov_count++; +- res = fuse_send_msg(se, ch, iov, iov_count); +- free(mbuf); +- +- return res; +-} +- +-struct fuse_ll_pipe { +- size_t size; +- int can_grow; +- int pipe[2]; +-}; +- +-static void fuse_ll_pipe_free(struct fuse_ll_pipe *llp) +-{ +- close(llp->pipe[0]); +- close(llp->pipe[1]); +- free(llp); +-} 
+- +-#ifdef HAVE_SPLICE +-#if !defined(HAVE_PIPE2) || !defined(O_CLOEXEC) +-static int fuse_pipe(int fds[2]) +-{ +- int rv = pipe(fds); +- +- if (rv == -1) +- return rv; +- +- if (fcntl(fds[0], F_SETFL, O_NONBLOCK) == -1 || +- fcntl(fds[1], F_SETFL, O_NONBLOCK) == -1 || +- fcntl(fds[0], F_SETFD, FD_CLOEXEC) == -1 || +- fcntl(fds[1], F_SETFD, FD_CLOEXEC) == -1) { +- close(fds[0]); +- close(fds[1]); +- rv = -1; +- } +- return rv; +-} +-#else +-static int fuse_pipe(int fds[2]) +-{ +- return pipe2(fds, O_CLOEXEC | O_NONBLOCK); +-} +-#endif +- +-static struct fuse_ll_pipe *fuse_ll_get_pipe(struct fuse_session *se) +-{ +- struct fuse_ll_pipe *llp = pthread_getspecific(se->pipe_key); +- if (llp == NULL) { +- int res; +- +- llp = malloc(sizeof(struct fuse_ll_pipe)); +- if (llp == NULL) +- return NULL; +- +- res = fuse_pipe(llp->pipe); +- if (res == -1) { +- free(llp); +- return NULL; +- } +- +- /* +- *the default size is 16 pages on linux +- */ +- llp->size = pagesize * 16; +- llp->can_grow = 1; +- +- pthread_setspecific(se->pipe_key, llp); +- } +- +- return llp; +-} +-#endif +- +-static void fuse_ll_clear_pipe(struct fuse_session *se) +-{ +- struct fuse_ll_pipe *llp = pthread_getspecific(se->pipe_key); +- if (llp) { +- pthread_setspecific(se->pipe_key, NULL); +- fuse_ll_pipe_free(llp); +- } +-} +- +-#if defined(HAVE_SPLICE) && defined(HAVE_VMSPLICE) +-static int read_back(int fd, char *buf, size_t len) +-{ +- int res; +- +- res = read(fd, buf, len); +- if (res == -1) { +- fuse_log(FUSE_LOG_ERR, "fuse: internal error: failed to read back from pipe: %s\n", strerror(errno)); +- return -EIO; +- } +- if (res != len) { +- fuse_log(FUSE_LOG_ERR, "fuse: internal error: short read back from pipe: %i from %zi\n", res, len); +- return -EIO; +- } ++ abort(); /* Will have taken vhost path */ + return 0; + } + +-static int grow_pipe_to_max(int pipefd) +-{ +- int max; +- int res; +- int maxfd; +- char buf[32]; +- +- maxfd = open("/proc/sys/fs/pipe-max-size", O_RDONLY); +- if (maxfd < 0) 
+- return -errno; +- +- res = read(maxfd, buf, sizeof(buf) - 1); +- if (res < 0) { +- int saved_errno; +- +- saved_errno = errno; +- close(maxfd); +- return -saved_errno; +- } +- close(maxfd); +- buf[res] = '\0'; +- +- max = atoi(buf); +- res = fcntl(pipefd, F_SETPIPE_SZ, max); +- if (res < 0) +- return -errno; +- return max; +-} +- +-static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, +- struct iovec *iov, int iov_count, +- struct fuse_bufvec *buf, unsigned int flags) +-{ +- int res; +- size_t len = fuse_buf_size(buf); +- struct fuse_out_header *out = iov[0].iov_base; +- struct fuse_ll_pipe *llp; +- int splice_flags; +- size_t pipesize; +- size_t total_fd_size; +- size_t idx; +- size_t headerlen; +- struct fuse_bufvec pipe_buf = FUSE_BUFVEC_INIT(len); +- +- if (se->broken_splice_nonblock) +- goto fallback; +- +- if (flags & FUSE_BUF_NO_SPLICE) +- goto fallback; +- +- total_fd_size = 0; +- for (idx = buf->idx; idx < buf->count; idx++) { +- if (buf->buf[idx].flags & FUSE_BUF_IS_FD) { +- total_fd_size = buf->buf[idx].size; +- if (idx == buf->idx) +- total_fd_size -= buf->off; +- } +- } +- if (total_fd_size < 2 * pagesize) +- goto fallback; +- +- if (se->conn.proto_minor < 14 || +- !(se->conn.want & FUSE_CAP_SPLICE_WRITE)) +- goto fallback; +- +- llp = fuse_ll_get_pipe(se); +- if (llp == NULL) +- goto fallback; +- +- +- headerlen = iov_length(iov, iov_count); +- +- out->len = headerlen + len; +- +- /* +- * Heuristic for the required pipe size, does not work if the +- * source contains less than page size fragments +- */ +- pipesize = pagesize * (iov_count + buf->count + 1) + out->len; +- +- if (llp->size < pipesize) { +- if (llp->can_grow) { +- res = fcntl(llp->pipe[0], F_SETPIPE_SZ, pipesize); +- if (res == -1) { +- res = grow_pipe_to_max(llp->pipe[0]); +- if (res > 0) +- llp->size = res; +- llp->can_grow = 0; +- goto fallback; +- } +- llp->size = res; +- } +- if (llp->size < pipesize) +- goto fallback; +- } +- +- +- res = 
vmsplice(llp->pipe[1], iov, iov_count, SPLICE_F_NONBLOCK); +- if (res == -1) +- goto fallback; +- +- if (res != headerlen) { +- res = -EIO; +- fuse_log(FUSE_LOG_ERR, "fuse: short vmsplice to pipe: %u/%zu\n", res, +- headerlen); +- goto clear_pipe; +- } +- +- pipe_buf.buf[0].flags = FUSE_BUF_IS_FD; +- pipe_buf.buf[0].fd = llp->pipe[1]; +- +- res = fuse_buf_copy(&pipe_buf, buf, +- FUSE_BUF_FORCE_SPLICE | FUSE_BUF_SPLICE_NONBLOCK); +- if (res < 0) { +- if (res == -EAGAIN || res == -EINVAL) { +- /* +- * Should only get EAGAIN on kernels with +- * broken SPLICE_F_NONBLOCK support (<= +- * 2.6.35) where this error or a short read is +- * returned even if the pipe itself is not +- * full +- * +- * EINVAL might mean that splice can't handle +- * this combination of input and output. +- */ +- if (res == -EAGAIN) +- se->broken_splice_nonblock = 1; +- +- pthread_setspecific(se->pipe_key, NULL); +- fuse_ll_pipe_free(llp); +- goto fallback; +- } +- res = -res; +- goto clear_pipe; +- } +- +- if (res != 0 && res < len) { +- struct fuse_bufvec mem_buf = FUSE_BUFVEC_INIT(len); +- void *mbuf; +- size_t now_len = res; +- /* +- * For regular files a short count is either +- * 1) due to EOF, or +- * 2) because of broken SPLICE_F_NONBLOCK (see above) +- * +- * For other inputs it's possible that we overflowed +- * the pipe because of small buffer fragments. +- */ +- +- res = posix_memalign(&mbuf, pagesize, len); +- if (res != 0) +- goto clear_pipe; +- +- mem_buf.buf[0].mem = mbuf; +- mem_buf.off = now_len; +- res = fuse_buf_copy(&mem_buf, buf, 0); +- if (res > 0) { +- char *tmpbuf; +- size_t extra_len = res; +- /* +- * Trickiest case: got more data. Need to get +- * back the data from the pipe and then fall +- * back to regular write. 
+- */ +- tmpbuf = malloc(headerlen); +- if (tmpbuf == NULL) { +- free(mbuf); +- res = ENOMEM; +- goto clear_pipe; +- } +- res = read_back(llp->pipe[0], tmpbuf, headerlen); +- free(tmpbuf); +- if (res != 0) { +- free(mbuf); +- goto clear_pipe; +- } +- res = read_back(llp->pipe[0], mbuf, now_len); +- if (res != 0) { +- free(mbuf); +- goto clear_pipe; +- } +- len = now_len + extra_len; +- iov[iov_count].iov_base = mbuf; +- iov[iov_count].iov_len = len; +- iov_count++; +- res = fuse_send_msg(se, ch, iov, iov_count); +- free(mbuf); +- return res; +- } +- free(mbuf); +- res = now_len; +- } +- len = res; +- out->len = headerlen + len; +- +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, +- " unique: %llu, success, outsize: %i (splice)\n", +- (unsigned long long) out->unique, out->len); +- } +- +- splice_flags = 0; +- if ((flags & FUSE_BUF_SPLICE_MOVE) && +- (se->conn.want & FUSE_CAP_SPLICE_MOVE)) +- splice_flags |= SPLICE_F_MOVE; +- +- res = splice(llp->pipe[0], NULL, ch ? ch->fd : se->fd, +- NULL, out->len, splice_flags); +- if (res == -1) { +- res = -errno; +- perror("fuse: splice from pipe"); +- goto clear_pipe; +- } +- if (res != out->len) { +- res = -EIO; +- fuse_log(FUSE_LOG_ERR, "fuse: short splice from pipe: %u/%u\n", +- res, out->len); +- goto clear_pipe; +- } +- return 0; +- +-clear_pipe: +- fuse_ll_clear_pipe(se); +- return res; +- +-fallback: +- return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); +-} +-#else + static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int iov_count, + struct fuse_bufvec *buf, unsigned int flags) +@@ -849,7 +485,6 @@ static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + + return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); + } +-#endif + + int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, + enum fuse_buf_copy_flags flags) +@@ -1408,16 +1043,11 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void 
*inarg, + if (bufv.buf[0].size < arg->size) { + fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); + fuse_reply_err(req, EIO); +- goto out; ++ return; + } + bufv.buf[0].size = arg->size; + + se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); +- +-out: +- /* Need to reset the pipe if ->write_buf() didn't consume all data */ +- if ((ibuf->flags & FUSE_BUF_IS_FD) && bufv.idx < bufv.count) +- fuse_ll_clear_pipe(se); + } + + static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) +@@ -2038,17 +1668,6 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + return; + } + +- unsigned max_read_mo = get_max_read(se->mo); +- if (se->conn.max_read != max_read_mo) { +- fuse_log(FUSE_LOG_ERR, "fuse: error: init() and fuse_session_new() " +- "requested different maximum read size (%u vs %u)\n", +- se->conn.max_read, max_read_mo); +- fuse_reply_err(req, EPROTO); +- se->error = -EPROTO; +- fuse_session_exit(se); +- return; +- } +- + if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { + se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; + } +@@ -2364,8 +1983,6 @@ static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, + } + out: + free(rreq); +- if ((ibuf->flags & FUSE_BUF_IS_FD) && bufv.idx < bufv.count) +- fuse_ll_clear_pipe(se); + } + + int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +@@ -2496,7 +2113,6 @@ static struct { + [FUSE_RENAME2] = { do_rename2, "RENAME2" }, + [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, + [FUSE_LSEEK] = { do_lseek, "LSEEK" }, +- [CUSE_INIT] = { cuse_lowlevel_init, "CUSE_INIT" }, + }; + + #define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0])) +@@ -2509,21 +2125,6 @@ static const char *opname(enum fuse_opcode opcode) + return fuse_ll_ops[opcode].name; + } + +-static int fuse_ll_copy_from_pipe(struct fuse_bufvec *dst, +- struct fuse_bufvec *src) +-{ +- ssize_t res = fuse_buf_copy(dst, src, 0); +- if 
(res < 0) { +- fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: %s\n", strerror(-res)); +- return res; +- } +- if ((size_t)res < fuse_buf_size(dst)) { +- fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: short read\n"); +- return -1; +- } +- return 0; +-} +- + void fuse_session_process_buf(struct fuse_session *se, + const struct fuse_buf *buf) + { +@@ -2533,36 +2134,12 @@ void fuse_session_process_buf(struct fuse_session *se, + void fuse_session_process_buf_int(struct fuse_session *se, + const struct fuse_buf *buf, struct fuse_chan *ch) + { +- const size_t write_header_size = sizeof(struct fuse_in_header) + +- sizeof(struct fuse_write_in); +- struct fuse_bufvec bufv = { .buf[0] = *buf, .count = 1 }; +- struct fuse_bufvec tmpbuf = FUSE_BUFVEC_INIT(write_header_size); + struct fuse_in_header *in; + const void *inarg; + struct fuse_req *req; +- void *mbuf = NULL; + int err; +- int res; +- +- if (buf->flags & FUSE_BUF_IS_FD) { +- if (buf->size < tmpbuf.buf[0].size) +- tmpbuf.buf[0].size = buf->size; + +- mbuf = malloc(tmpbuf.buf[0].size); +- if (mbuf == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate header\n"); +- goto clear_pipe; +- } +- tmpbuf.buf[0].mem = mbuf; +- +- res = fuse_ll_copy_from_pipe(&tmpbuf, &bufv); +- if (res < 0) +- goto clear_pipe; +- +- in = mbuf; +- } else { +- in = buf->mem; +- } ++ in = buf->mem; + + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, +@@ -2584,14 +2161,14 @@ void fuse_session_process_buf_int(struct fuse_session *se, + }; + + fuse_send_msg(se, ch, &iov, 1); +- goto clear_pipe; ++ return; + } + + req->unique = in->unique; + req->ctx.uid = in->uid; + req->ctx.gid = in->gid; + req->ctx.pid = in->pid; +- req->ch = ch ? 
fuse_chan_get(ch) : NULL; ++ req->ch = ch; + + err = EIO; + if (!se->got_init) { +@@ -2627,28 +2204,6 @@ void fuse_session_process_buf_int(struct fuse_session *se, + fuse_reply_err(intr, EAGAIN); + } + +- if ((buf->flags & FUSE_BUF_IS_FD) && write_header_size < buf->size && +- (in->opcode != FUSE_WRITE || !se->op.write_buf) && +- in->opcode != FUSE_NOTIFY_REPLY) { +- void *newmbuf; +- +- err = ENOMEM; +- newmbuf = realloc(mbuf, buf->size); +- if (newmbuf == NULL) +- goto reply_err; +- mbuf = newmbuf; +- +- tmpbuf = FUSE_BUFVEC_INIT(buf->size - write_header_size); +- tmpbuf.buf[0].mem = (char *)mbuf + write_header_size; +- +- res = fuse_ll_copy_from_pipe(&tmpbuf, &bufv); +- err = -res; +- if (res < 0) +- goto reply_err; +- +- in = mbuf; +- } +- + inarg = (void *) &in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) + do_write_buf(req, in->nodeid, inarg, buf); +@@ -2657,16 +2212,10 @@ void fuse_session_process_buf_int(struct fuse_session *se, + else + fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); + +-out_free: +- free(mbuf); + return; + + reply_err: + fuse_reply_err(req, err); +-clear_pipe: +- if (buf->flags & FUSE_BUF_IS_FD) +- fuse_ll_clear_pipe(se); +- goto out_free; + } + + #define LL_OPTION(n,o,v) \ +@@ -2684,7 +2233,6 @@ void fuse_lowlevel_version(void) + { + printf("using FUSE kernel interface version %i.%i\n", + FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); +- fuse_mount_version(); + } + + void fuse_lowlevel_help(void) +@@ -2692,204 +2240,29 @@ void fuse_lowlevel_help(void) + /* These are not all options, but the ones that are + potentially of interest to an end-user */ + printf( +-" -o allow_other allow access by all users\n" + " -o allow_root allow access by root\n" +-" -o auto_unmount auto unmount on process termination\n"); ++); + } + + void fuse_session_destroy(struct fuse_session *se) + { +- struct fuse_ll_pipe *llp; +- + if (se->got_init && !se->got_destroy) { + if (se->op.destroy) + se->op.destroy(se->userdata); + } +- llp = 
pthread_getspecific(se->pipe_key); +- if (llp != NULL) +- fuse_ll_pipe_free(llp); +- pthread_key_delete(se->pipe_key); + pthread_mutex_destroy(&se->lock); + free(se->cuse_data); + if (se->fd != -1) + close(se->fd); +- destroy_mount_opts(se->mo); + free(se); + } + + +-static void fuse_ll_pipe_destructor(void *data) +-{ +- struct fuse_ll_pipe *llp = data; +- fuse_ll_pipe_free(llp); +-} +- +-int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf) +-{ +- return fuse_session_receive_buf_int(se, buf, NULL); +-} +- +-int fuse_session_receive_buf_int(struct fuse_session *se, struct fuse_buf *buf, +- struct fuse_chan *ch) +-{ +- int err; +- ssize_t res; +-#ifdef HAVE_SPLICE +- size_t bufsize = se->bufsize; +- struct fuse_ll_pipe *llp; +- struct fuse_buf tmpbuf; +- +- if (se->conn.proto_minor < 14 || !(se->conn.want & FUSE_CAP_SPLICE_READ)) +- goto fallback; +- +- llp = fuse_ll_get_pipe(se); +- if (llp == NULL) +- goto fallback; +- +- if (llp->size < bufsize) { +- if (llp->can_grow) { +- res = fcntl(llp->pipe[0], F_SETPIPE_SZ, bufsize); +- if (res == -1) { +- llp->can_grow = 0; +- res = grow_pipe_to_max(llp->pipe[0]); +- if (res > 0) +- llp->size = res; +- goto fallback; +- } +- llp->size = res; +- } +- if (llp->size < bufsize) +- goto fallback; +- } +- +- res = splice(ch ? 
ch->fd : se->fd, +- NULL, llp->pipe[1], NULL, bufsize, 0); +- err = errno; +- +- if (fuse_session_exited(se)) +- return 0; +- +- if (res == -1) { +- if (err == ENODEV) { +- /* Filesystem was unmounted, or connection was aborted +- via /sys/fs/fuse/connections */ +- fuse_session_exit(se); +- return 0; +- } +- if (err != EINTR && err != EAGAIN) +- perror("fuse: splice from device"); +- return -err; +- } +- +- if (res < sizeof(struct fuse_in_header)) { +- fuse_log(FUSE_LOG_ERR, "short splice from fuse device\n"); +- return -EIO; +- } +- +- tmpbuf = (struct fuse_buf) { +- .size = res, +- .flags = FUSE_BUF_IS_FD, +- .fd = llp->pipe[0], +- }; +- +- /* +- * Don't bother with zero copy for small requests. +- * fuse_loop_mt() needs to check for FORGET so this more than +- * just an optimization. +- */ +- if (res < sizeof(struct fuse_in_header) + +- sizeof(struct fuse_write_in) + pagesize) { +- struct fuse_bufvec src = { .buf[0] = tmpbuf, .count = 1 }; +- struct fuse_bufvec dst = { .count = 1 }; +- +- if (!buf->mem) { +- buf->mem = malloc(se->bufsize); +- if (!buf->mem) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: failed to allocate read buffer\n"); +- return -ENOMEM; +- } +- } +- buf->size = se->bufsize; +- buf->flags = 0; +- dst.buf[0] = *buf; +- +- res = fuse_buf_copy(&dst, &src, 0); +- if (res < 0) { +- fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: %s\n", +- strerror(-res)); +- fuse_ll_clear_pipe(se); +- return res; +- } +- if (res < tmpbuf.size) { +- fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: short read\n"); +- fuse_ll_clear_pipe(se); +- return -EIO; +- } +- assert(res == tmpbuf.size); +- +- } else { +- /* Don't overwrite buf->mem, as that would cause a leak */ +- buf->fd = tmpbuf.fd; +- buf->flags = tmpbuf.flags; +- } +- buf->size = tmpbuf.size; +- +- return res; +- +-fallback: +-#endif +- if (!buf->mem) { +- buf->mem = malloc(se->bufsize); +- if (!buf->mem) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: failed to allocate read buffer\n"); +- return -ENOMEM; +- } +- } +- +-restart: 
+- res = read(ch ? ch->fd : se->fd, buf->mem, se->bufsize); +- err = errno; +- +- if (fuse_session_exited(se)) +- return 0; +- if (res == -1) { +- /* ENOENT means the operation was interrupted, it's safe +- to restart */ +- if (err == ENOENT) +- goto restart; +- +- if (err == ENODEV) { +- /* Filesystem was unmounted, or connection was aborted +- via /sys/fs/fuse/connections */ +- fuse_session_exit(se); +- return 0; +- } +- /* Errors occurring during normal operation: EINTR (read +- interrupted), EAGAIN (nonblocking I/O), ENODEV (filesystem +- umounted) */ +- if (err != EINTR && err != EAGAIN) +- perror("fuse: reading device"); +- return -err; +- } +- if ((size_t) res < sizeof(struct fuse_in_header)) { +- fuse_log(FUSE_LOG_ERR, "short read on fuse device\n"); +- return -EIO; +- } +- +- buf->size = res; +- +- return res; +-} +- + struct fuse_session *fuse_session_new(struct fuse_args *args, + const struct fuse_lowlevel_ops *op, + size_t op_size, void *userdata) + { +- int err; + struct fuse_session *se; +- struct mount_opts *mo; + + if (sizeof(struct fuse_lowlevel_ops) < op_size) { + fuse_log(FUSE_LOG_ERR, "fuse: warning: library too old, some operations may not work\n"); +@@ -2913,20 +2286,6 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + /* Parse options */ + if(fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) + goto out2; +- if(se->deny_others) { +- /* Allowing access only by root is done by instructing +- * kernel to allow access by everyone, and then restricting +- * access to root and mountpoint owner in libfuse. +- */ +- // We may be adding the option a second time, but +- // that doesn't hurt. 
+- if(fuse_opt_add_arg(args, "-oallow_other") == -1) +- goto out2; +- } +- mo = parse_mount_opts(args); +- if (mo == NULL) +- goto out3; +- + if(args->argc == 1 && + args->argv[0][0] == '-') { + fuse_log(FUSE_LOG_ERR, "fuse: warning: argv[0] looks like an option, but " +@@ -2940,9 +2299,6 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + goto out4; + } + +- if (se->debug) +- fuse_log(FUSE_LOG_DEBUG, "FUSE library version: %s\n", PACKAGE_VERSION); +- + se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + + FUSE_BUFFER_HEADER_SIZE; + +@@ -2952,26 +2308,14 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + se->notify_ctr = 1; + fuse_mutex_init(&se->lock); + +- err = pthread_key_create(&se->pipe_key, fuse_ll_pipe_destructor); +- if (err) { +- fuse_log(FUSE_LOG_ERR, "fuse: failed to create thread specific key: %s\n", +- strerror(err)); +- goto out5; +- } +- + memcpy(&se->op, op, op_size); + se->owner = getuid(); + se->userdata = userdata; + +- se->mo = mo; + return se; + +-out5: +- pthread_mutex_destroy(&se->lock); + out4: + fuse_opt_free_args(args); +-out3: +- free(mo); + out2: + free(se); + out1: +@@ -3035,11 +2379,6 @@ int fuse_session_fd(struct fuse_session *se) + + void fuse_session_unmount(struct fuse_session *se) + { +- if (se->mountpoint != NULL) { +- fuse_kern_unmount(se->mountpoint, se->fd); +- free(se->mountpoint); +- se->mountpoint = NULL; +- } + } + + #ifdef linux +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 18c6363..6b1adfc 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -31,10 +31,6 @@ + #include + #include + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /* ----------------------------------------------------------- * + * Miscellaneous definitions * + * ----------------------------------------------------------- */ +@@ -1863,14 +1859,12 @@ void fuse_cmdline_help(void); + * ----------------------------------------------------------- */ + + 
struct fuse_cmdline_opts { +- int singlethread; + int foreground; + int debug; + int nodefault_subtype; + char *mountpoint; + int show_version; + int show_help; +- int clone_fd; + unsigned int max_idle_threads; + }; + +@@ -1962,24 +1956,6 @@ int fuse_session_mount(struct fuse_session *se, const char *mountpoint); + int fuse_session_loop(struct fuse_session *se); + + /** +- * Enter a multi-threaded event loop. +- * +- * For a description of the return value and the conditions when the +- * event loop exits, refer to the documentation of +- * fuse_session_loop(). +- * +- * @param se the session +- * @param config session loop configuration +- * @return see fuse_session_loop() +- */ +-#if FUSE_USE_VERSION < 32 +-int fuse_session_loop_mt_31(struct fuse_session *se, int clone_fd); +-#define fuse_session_loop_mt(se, clone_fd) fuse_session_loop_mt_31(se, clone_fd) +-#else +-int fuse_session_loop_mt(struct fuse_session *se, struct fuse_loop_config *config); +-#endif +- +-/** + * Flag a session as terminated. 
+ * + * This function is invoked by the POSIX signal handlers, when +@@ -2082,8 +2058,4 @@ void fuse_session_process_buf(struct fuse_session *se, + */ + int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf); + +-#ifdef __cplusplus +-} +-#endif +- + #endif /* FUSE_LOWLEVEL_H_ */ +diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h +index d8573e7..6910255 100644 +--- a/tools/virtiofsd/fuse_opt.h ++++ b/tools/virtiofsd/fuse_opt.h +@@ -14,10 +14,6 @@ + * This file defines the option parsing interface of FUSE + */ + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /** + * Option description + * +@@ -264,8 +260,4 @@ void fuse_opt_free_args(struct fuse_args *args); + */ + int fuse_opt_match(const struct fuse_opt opts[], const char *opt); + +-#ifdef __cplusplus +-} +-#endif +- + #endif /* FUSE_OPT_H_ */ +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 64ff7ad..5a2e64c 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -41,14 +41,10 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), + FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("-f", foreground), +- FUSE_HELPER_OPT("-s", singlethread), + FUSE_HELPER_OPT("fsname=", nodefault_subtype), + FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), +-#ifndef __FreeBSD__ + FUSE_HELPER_OPT("subtype=", nodefault_subtype), + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), +-#endif +- FUSE_HELPER_OPT("clone_fd", clone_fd), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), + FUSE_OPT_END + }; +@@ -132,9 +128,6 @@ void fuse_cmdline_help(void) + " -V --version print version\n" + " -d -o debug enable debug output (implies -f)\n" + " -f foreground operation\n" +- " -s disable multi-threaded operation\n" +- " -o clone_fd use separate fuse device fd for each thread\n" +- " (may improve performance)\n" + " -o max_idle_threads the maximum number of idle worker threads\n" + " allowed (default: 10)\n"); + 
} +@@ -171,34 +164,6 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key, + } + } + +-/* Under FreeBSD, there is no subtype option so this +- function actually sets the fsname */ +-static int add_default_subtype(const char *progname, struct fuse_args *args) +-{ +- int res; +- char *subtype_opt; +- +- const char *basename = strrchr(progname, '/'); +- if (basename == NULL) +- basename = progname; +- else if (basename[1] != '\0') +- basename++; +- +- subtype_opt = (char *) malloc(strlen(basename) + 64); +- if (subtype_opt == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); +- return -1; +- } +-#ifdef __FreeBSD__ +- sprintf(subtype_opt, "-ofsname=%s", basename); +-#else +- sprintf(subtype_opt, "-osubtype=%s", basename); +-#endif +- res = fuse_opt_add_arg(args, subtype_opt); +- free(subtype_opt); +- return res; +-} +- + int fuse_parse_cmdline(struct fuse_args *args, + struct fuse_cmdline_opts *opts) + { +@@ -210,14 +175,6 @@ int fuse_parse_cmdline(struct fuse_args *args, + fuse_helper_opt_proc) == -1) + return -1; + +- /* *Linux*: if neither -o subtype nor -o fsname are specified, +- set subtype to program's basename. +- *FreeBSD*: if fsname is not specified, set to program's +- basename. 
*/ +- if (!opts->nodefault_subtype) +- if (add_default_subtype(args->argv[0], args) == -1) +- return -1; +- + return 0; + } + +@@ -276,88 +233,6 @@ int fuse_daemonize(int foreground) + return 0; + } + +-int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op, +- size_t op_size, void *user_data) +-{ +- struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +- struct fuse *fuse; +- struct fuse_cmdline_opts opts; +- int res; +- +- if (fuse_parse_cmdline(&args, &opts) != 0) +- return 1; +- +- if (opts.show_version) { +- printf("FUSE library version %s\n", PACKAGE_VERSION); +- fuse_lowlevel_version(); +- res = 0; +- goto out1; +- } +- +- if (opts.show_help) { +- if(args.argv[0][0] != '\0') +- printf("usage: %s [options] \n\n", +- args.argv[0]); +- printf("FUSE options:\n"); +- fuse_cmdline_help(); +- fuse_lib_help(&args); +- res = 0; +- goto out1; +- } +- +- if (!opts.show_help && +- !opts.mountpoint) { +- fuse_log(FUSE_LOG_ERR, "error: no mountpoint specified\n"); +- res = 2; +- goto out1; +- } +- +- +- fuse = fuse_new_31(&args, op, op_size, user_data); +- if (fuse == NULL) { +- res = 3; +- goto out1; +- } +- +- if (fuse_mount(fuse,opts.mountpoint) != 0) { +- res = 4; +- goto out2; +- } +- +- if (fuse_daemonize(opts.foreground) != 0) { +- res = 5; +- goto out3; +- } +- +- struct fuse_session *se = fuse_get_session(fuse); +- if (fuse_set_signal_handlers(se) != 0) { +- res = 6; +- goto out3; +- } +- +- if (opts.singlethread) +- res = fuse_loop(fuse); +- else { +- struct fuse_loop_config loop_config; +- loop_config.clone_fd = opts.clone_fd; +- loop_config.max_idle_threads = opts.max_idle_threads; +- res = fuse_loop_mt_32(fuse, &loop_config); +- } +- if (res) +- res = 7; +- +- fuse_remove_signal_handlers(se); +-out3: +- fuse_unmount(fuse); +-out2: +- fuse_destroy(fuse); +-out1: +- free(opts.mountpoint); +- fuse_opt_free_args(&args); +- return res; +-} +- +- + void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, + struct fuse_conn_info *conn) + 
{ +@@ -420,21 +295,3 @@ struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args) + } + return opts; + } +- +-int fuse_open_channel(const char *mountpoint, const char* options) +-{ +- struct mount_opts *opts = NULL; +- int fd = -1; +- const char *argv[] = { "", "-o", options }; +- int argc = sizeof(argv) / sizeof(argv[0]); +- struct fuse_args args = FUSE_ARGS_INIT(argc, (char**) argv); +- +- opts = parse_mount_opts(&args); +- if (opts == NULL) +- return -1; +- +- fd = fuse_kern_mount(mountpoint, opts); +- destroy_mount_opts(opts); +- +- return fd; +-} +diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h +index 6b77c33..7c5f561 100644 +--- a/tools/virtiofsd/passthrough_helpers.h ++++ b/tools/virtiofsd/passthrough_helpers.h +@@ -42,32 +42,6 @@ static int mknod_wrapper(int dirfd, const char *path, const char *link, + res = symlinkat(link, dirfd, path); + } else if (S_ISFIFO(mode)) { + res = mkfifoat(dirfd, path, mode); +-#ifdef __FreeBSD__ +- } else if (S_ISSOCK(mode)) { +- struct sockaddr_un su; +- int fd; +- +- if (strlen(path) >= sizeof(su.sun_path)) { +- errno = ENAMETOOLONG; +- return -1; +- } +- fd = socket(AF_UNIX, SOCK_STREAM, 0); +- if (fd >= 0) { +- /* +- * We must bind the socket to the underlying file +- * system to create the socket file, even though +- * we'll never listen on this socket. 
+- */ +- su.sun_family = AF_UNIX; +- strncpy(su.sun_path, path, sizeof(su.sun_path)); +- res = bindat(dirfd, fd, (struct sockaddr*)&su, +- sizeof(su)); +- if (res == 0) +- close(fd); +- } else { +- res = -1; +- } +-#endif + } else { + res = mknodat(dirfd, path, mode, rdev); + } +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e1a6056..e5f7115 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1240,7 +1240,6 @@ int main(int argc, char *argv[]) + ret = 0; + goto err_out1; + } else if (opts.show_version) { +- printf("FUSE library version %s\n", fuse_pkgversion()); + fuse_lowlevel_version(); + ret = 0; + goto err_out1; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Trim-out-compatibility-code.patch b/kvm-virtiofsd-Trim-out-compatibility-code.patch new file mode 100755 index 0000000..411af77 --- /dev/null +++ b/kvm-virtiofsd-Trim-out-compatibility-code.patch @@ -0,0 +1,545 @@ +From ff16b837e402de773581f77ca188f8806c0b500f Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:51 +0100 +Subject: [PATCH 020/116] virtiofsd: Trim out compatibility code +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-17-dgilbert@redhat.com> +Patchwork-id: 93468 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 016/112] virtiofsd: Trim out compatibility code +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +virtiofsd only supports major=7, minor>=31; trim out a lot of +old compatibility code. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 72c42e2d65510e073cf78fdc924d121c77fa0080) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 330 +++++++++++++++------------------------- + 1 file changed, 119 insertions(+), 211 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 07fb8a6..514d79c 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -387,16 +387,7 @@ static void fill_open(struct fuse_open_out *arg, const struct fuse_file_info *f) + int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e) + { + struct fuse_entry_out arg; +- size_t size = req->se->conn.proto_minor < 9 ? FUSE_COMPAT_ENTRY_OUT_SIZE : +- sizeof(arg); +- +- /* +- * before ABI 7.4 e->ino == 0 was invalid, only ENOENT meant +- * negative entry +- */ +- if (!e->ino && req->se->conn.proto_minor < 4) { +- return fuse_reply_err(req, ENOENT); +- } ++ size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + fill_entry(&arg, e); +@@ -407,9 +398,7 @@ int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, + const struct fuse_file_info *f) + { + char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; +- size_t entrysize = req->se->conn.proto_minor < 9 ? +- FUSE_COMPAT_ENTRY_OUT_SIZE : +- sizeof(struct fuse_entry_out); ++ size_t entrysize = sizeof(struct fuse_entry_out); + struct fuse_entry_out *earg = (struct fuse_entry_out *)buf; + struct fuse_open_out *oarg = (struct fuse_open_out *)(buf + entrysize); + +@@ -423,8 +412,7 @@ int fuse_reply_attr(fuse_req_t req, const struct stat *attr, + double attr_timeout) + { + struct fuse_attr_out arg; +- size_t size = +- req->se->conn.proto_minor < 9 ? 
FUSE_COMPAT_ATTR_OUT_SIZE : sizeof(arg); ++ size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + arg.attr_valid = calc_timeout_sec(attr_timeout); +@@ -519,8 +507,7 @@ int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv) + int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf) + { + struct fuse_statfs_out arg; +- size_t size = +- req->se->conn.proto_minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(arg); ++ size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + convert_statfs(stbuf, &arg.st); +@@ -604,45 +591,31 @@ int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, + iov[count].iov_len = sizeof(arg); + count++; + +- if (req->se->conn.proto_minor < 16) { +- if (in_count) { +- iov[count].iov_base = (void *)in_iov; +- iov[count].iov_len = sizeof(in_iov[0]) * in_count; +- count++; +- } ++ /* Can't handle non-compat 64bit ioctls on 32bit */ ++ if (sizeof(void *) == 4 && req->ioctl_64bit) { ++ res = fuse_reply_err(req, EINVAL); ++ goto out; ++ } + +- if (out_count) { +- iov[count].iov_base = (void *)out_iov; +- iov[count].iov_len = sizeof(out_iov[0]) * out_count; +- count++; ++ if (in_count) { ++ in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); ++ if (!in_fiov) { ++ goto enomem; + } +- } else { +- /* Can't handle non-compat 64bit ioctls on 32bit */ +- if (sizeof(void *) == 4 && req->ioctl_64bit) { +- res = fuse_reply_err(req, EINVAL); +- goto out; +- } +- +- if (in_count) { +- in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); +- if (!in_fiov) { +- goto enomem; +- } + +- iov[count].iov_base = (void *)in_fiov; +- iov[count].iov_len = sizeof(in_fiov[0]) * in_count; +- count++; ++ iov[count].iov_base = (void *)in_fiov; ++ iov[count].iov_len = sizeof(in_fiov[0]) * in_count; ++ count++; ++ } ++ if (out_count) { ++ out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); ++ if (!out_fiov) { ++ goto enomem; + } +- if (out_count) { +- out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); +- if (!out_fiov) { +- goto enomem; +- } 
+ +- iov[count].iov_base = (void *)out_fiov; +- iov[count].iov_len = sizeof(out_fiov[0]) * out_count; +- count++; +- } ++ iov[count].iov_base = (void *)out_fiov; ++ iov[count].iov_len = sizeof(out_fiov[0]) * out_count; ++ count++; + } + + res = send_reply_iov(req, 0, iov, count); +@@ -784,14 +757,12 @@ static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + struct fuse_file_info *fip = NULL; + struct fuse_file_info fi; + +- if (req->se->conn.proto_minor >= 9) { +- struct fuse_getattr_in *arg = (struct fuse_getattr_in *)inarg; ++ struct fuse_getattr_in *arg = (struct fuse_getattr_in *)inarg; + +- if (arg->getattr_flags & FUSE_GETATTR_FH) { +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fip = &fi; +- } ++ if (arg->getattr_flags & FUSE_GETATTR_FH) { ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fip = &fi; + } + + if (req->se->op.getattr) { +@@ -856,11 +827,7 @@ static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + struct fuse_mknod_in *arg = (struct fuse_mknod_in *)inarg; + char *name = PARAM(arg); + +- if (req->se->conn.proto_minor >= 12) { +- req->ctx.umask = arg->umask; +- } else { +- name = (char *)inarg + FUSE_COMPAT_MKNOD_IN_SIZE; +- } ++ req->ctx.umask = arg->umask; + + if (req->se->op.mknod) { + req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); +@@ -873,9 +840,7 @@ static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { + struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *)inarg; + +- if (req->se->conn.proto_minor >= 12) { +- req->ctx.umask = arg->umask; +- } ++ req->ctx.umask = arg->umask; + + if (req->se->op.mkdir) { + req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); +@@ -967,11 +932,7 @@ static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + +- if (req->se->conn.proto_minor >= 12) { +- req->ctx.umask = arg->umask; +- } else { +- name = (char *)inarg + sizeof(struct fuse_open_in); +- 
} ++ req->ctx.umask = arg->umask; + + req->se->op.create(req, nodeid, name, arg->mode, &fi); + } else { +@@ -1003,10 +964,8 @@ static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +- if (req->se->conn.proto_minor >= 9) { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- } ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; + req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); +@@ -1023,13 +982,9 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + fi.fh = arg->fh; + fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; + +- if (req->se->conn.proto_minor < 9) { +- param = ((char *)arg) + FUSE_COMPAT_WRITE_IN_SIZE; +- } else { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- param = PARAM(arg); +- } ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ param = PARAM(arg); + + if (req->se->op.write) { + req->se->op.write(req, nodeid, param, arg->size, arg->offset, &fi); +@@ -1053,21 +1008,14 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, + fi.fh = arg->fh; + fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; + +- if (se->conn.proto_minor < 9) { +- bufv.buf[0].mem = ((char *)arg) + FUSE_COMPAT_WRITE_IN_SIZE; +- bufv.buf[0].size -= +- sizeof(struct fuse_in_header) + FUSE_COMPAT_WRITE_IN_SIZE; +- assert(!(bufv.buf[0].flags & FUSE_BUF_IS_FD)); +- } else { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { +- bufv.buf[0].mem = PARAM(arg); +- } +- +- bufv.buf[0].size -= +- sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { ++ bufv.buf[0].mem = PARAM(arg); + } ++ ++ bufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); + if 
(bufv.buf[0].size < arg->size) { + fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); + fuse_reply_err(req, EIO); +@@ -1086,9 +1034,7 @@ static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.flush = 1; +- if (req->se->conn.proto_minor >= 7) { +- fi.lock_owner = arg->lock_owner; +- } ++ fi.lock_owner = arg->lock_owner; + + if (req->se->op.flush) { + req->se->op.flush(req, nodeid, &fi); +@@ -1105,10 +1051,8 @@ static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; +- if (req->se->conn.proto_minor >= 8) { +- fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; +- fi.lock_owner = arg->lock_owner; +- } ++ fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; ++ fi.lock_owner = arg->lock_owner; + if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { + fi.flock_release = 1; + fi.lock_owner = arg->lock_owner; +@@ -1477,8 +1421,7 @@ static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +- if (sizeof(void *) == 4 && req->se->conn.proto_minor >= 16 && +- !(flags & FUSE_IOCTL_32BIT)) { ++ if (sizeof(void *) == 4 && !(flags & FUSE_IOCTL_32BIT)) { + req->ioctl_64bit = 1; + } + +@@ -1603,7 +1546,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + outarg.major = FUSE_KERNEL_VERSION; + outarg.minor = FUSE_KERNEL_MINOR_VERSION; + +- if (arg->major < 7) { ++ if (arg->major < 7 || (arg->major == 7 && arg->minor < 31)) { + fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", + arg->major, arg->minor); + fuse_reply_err(req, EPROTO); +@@ -1616,81 +1559,71 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + return; + } + +- if (arg->minor >= 6) { +- if (arg->max_readahead < se->conn.max_readahead) { +- se->conn.max_readahead = 
arg->max_readahead; +- } +- if (arg->flags & FUSE_ASYNC_READ) { +- se->conn.capable |= FUSE_CAP_ASYNC_READ; +- } +- if (arg->flags & FUSE_POSIX_LOCKS) { +- se->conn.capable |= FUSE_CAP_POSIX_LOCKS; +- } +- if (arg->flags & FUSE_ATOMIC_O_TRUNC) { +- se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; +- } +- if (arg->flags & FUSE_EXPORT_SUPPORT) { +- se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; +- } +- if (arg->flags & FUSE_DONT_MASK) { +- se->conn.capable |= FUSE_CAP_DONT_MASK; +- } +- if (arg->flags & FUSE_FLOCK_LOCKS) { +- se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; +- } +- if (arg->flags & FUSE_AUTO_INVAL_DATA) { +- se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; +- } +- if (arg->flags & FUSE_DO_READDIRPLUS) { +- se->conn.capable |= FUSE_CAP_READDIRPLUS; +- } +- if (arg->flags & FUSE_READDIRPLUS_AUTO) { +- se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; +- } +- if (arg->flags & FUSE_ASYNC_DIO) { +- se->conn.capable |= FUSE_CAP_ASYNC_DIO; +- } +- if (arg->flags & FUSE_WRITEBACK_CACHE) { +- se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; +- } +- if (arg->flags & FUSE_NO_OPEN_SUPPORT) { +- se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; +- } +- if (arg->flags & FUSE_PARALLEL_DIROPS) { +- se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; +- } +- if (arg->flags & FUSE_POSIX_ACL) { +- se->conn.capable |= FUSE_CAP_POSIX_ACL; +- } +- if (arg->flags & FUSE_HANDLE_KILLPRIV) { +- se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; +- } +- if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) { +- se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; +- } +- if (!(arg->flags & FUSE_MAX_PAGES)) { +- size_t max_bufsize = +- FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() + +- FUSE_BUFFER_HEADER_SIZE; +- if (bufsize > max_bufsize) { +- bufsize = max_bufsize; +- } ++ if (arg->max_readahead < se->conn.max_readahead) { ++ se->conn.max_readahead = arg->max_readahead; ++ } ++ if (arg->flags & FUSE_ASYNC_READ) { ++ se->conn.capable |= FUSE_CAP_ASYNC_READ; ++ } ++ if (arg->flags & FUSE_POSIX_LOCKS) { ++ se->conn.capable |= 
FUSE_CAP_POSIX_LOCKS; ++ } ++ if (arg->flags & FUSE_ATOMIC_O_TRUNC) { ++ se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; ++ } ++ if (arg->flags & FUSE_EXPORT_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; ++ } ++ if (arg->flags & FUSE_DONT_MASK) { ++ se->conn.capable |= FUSE_CAP_DONT_MASK; ++ } ++ if (arg->flags & FUSE_FLOCK_LOCKS) { ++ se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; ++ } ++ if (arg->flags & FUSE_AUTO_INVAL_DATA) { ++ se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; ++ } ++ if (arg->flags & FUSE_DO_READDIRPLUS) { ++ se->conn.capable |= FUSE_CAP_READDIRPLUS; ++ } ++ if (arg->flags & FUSE_READDIRPLUS_AUTO) { ++ se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; ++ } ++ if (arg->flags & FUSE_ASYNC_DIO) { ++ se->conn.capable |= FUSE_CAP_ASYNC_DIO; ++ } ++ if (arg->flags & FUSE_WRITEBACK_CACHE) { ++ se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; ++ } ++ if (arg->flags & FUSE_NO_OPEN_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; ++ } ++ if (arg->flags & FUSE_PARALLEL_DIROPS) { ++ se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; ++ } ++ if (arg->flags & FUSE_POSIX_ACL) { ++ se->conn.capable |= FUSE_CAP_POSIX_ACL; ++ } ++ if (arg->flags & FUSE_HANDLE_KILLPRIV) { ++ se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; ++ } ++ if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; ++ } ++ if (!(arg->flags & FUSE_MAX_PAGES)) { ++ size_t max_bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() + ++ FUSE_BUFFER_HEADER_SIZE; ++ if (bufsize > max_bufsize) { ++ bufsize = max_bufsize; + } +- } else { +- se->conn.max_readahead = 0; + } +- +- if (se->conn.proto_minor >= 14) { + #ifdef HAVE_SPLICE + #ifdef HAVE_VMSPLICE +- se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; ++ se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; + #endif +- se->conn.capable |= FUSE_CAP_SPLICE_READ; ++ se->conn.capable |= FUSE_CAP_SPLICE_READ; + #endif +- } +- if (se->conn.proto_minor >= 18) { +- se->conn.capable |= 
FUSE_CAP_IOCTL_DIR; +- } ++ se->conn.capable |= FUSE_CAP_IOCTL_DIR; + + /* + * Default settings for modern filesystems. +@@ -1797,24 +1730,20 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + outarg.max_readahead = se->conn.max_readahead; + outarg.max_write = se->conn.max_write; +- if (se->conn.proto_minor >= 13) { +- if (se->conn.max_background >= (1 << 16)) { +- se->conn.max_background = (1 << 16) - 1; +- } +- if (se->conn.congestion_threshold > se->conn.max_background) { +- se->conn.congestion_threshold = se->conn.max_background; +- } +- if (!se->conn.congestion_threshold) { +- se->conn.congestion_threshold = se->conn.max_background * 3 / 4; +- } +- +- outarg.max_background = se->conn.max_background; +- outarg.congestion_threshold = se->conn.congestion_threshold; ++ if (se->conn.max_background >= (1 << 16)) { ++ se->conn.max_background = (1 << 16) - 1; ++ } ++ if (se->conn.congestion_threshold > se->conn.max_background) { ++ se->conn.congestion_threshold = se->conn.max_background; + } +- if (se->conn.proto_minor >= 23) { +- outarg.time_gran = se->conn.time_gran; ++ if (!se->conn.congestion_threshold) { ++ se->conn.congestion_threshold = se->conn.max_background * 3 / 4; + } + ++ outarg.max_background = se->conn.max_background; ++ outarg.congestion_threshold = se->conn.congestion_threshold; ++ outarg.time_gran = se->conn.time_gran; ++ + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, + outarg.minor); +@@ -1828,11 +1757,6 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + outarg.congestion_threshold); + fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); + } +- if (arg->minor < 5) { +- outargsize = FUSE_COMPAT_INIT_OUT_SIZE; +- } else if (arg->minor < 23) { +- outargsize = FUSE_COMPAT_22_INIT_OUT_SIZE; +- } + + send_reply_ok(req, &outarg, outargsize); + } +@@ -1896,10 +1820,6 @@ int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + return 
-EINVAL; + } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) { +- return -ENOSYS; +- } +- + outarg.ino = ino; + outarg.off = off; + outarg.len = len; +@@ -1920,10 +1840,6 @@ int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, + return -EINVAL; + } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) { +- return -ENOSYS; +- } +- + outarg.parent = parent; + outarg.namelen = namelen; + outarg.padding = 0; +@@ -1947,10 +1863,6 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + return -EINVAL; + } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 18) { +- return -ENOSYS; +- } +- + outarg.parent = parent; + outarg.child = child; + outarg.namelen = namelen; +@@ -1977,10 +1889,6 @@ int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + return -EINVAL; + } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) { +- return -ENOSYS; +- } +- + out.unique = 0; + out.error = FUSE_NOTIFY_STORE; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Whitelist-fchmod.patch b/kvm-virtiofsd-Whitelist-fchmod.patch new file mode 100755 index 0000000..a4f95d9 --- /dev/null +++ b/kvm-virtiofsd-Whitelist-fchmod.patch @@ -0,0 +1,79 @@ +From 181ed1777c3dd50b1ff9907b0a4199e845af1270 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Fri, 18 Jun 2021 16:21:17 -0400 +Subject: [PATCH 1/4] virtiofsd: Whitelist fchmod +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Max Reitz +Message-id: <20210618162117.97775-2-mreitz@redhat.com> +Patchwork-id: 101719 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] virtiofsd: Whitelist fchmod +Bugzilla: 1967914 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Vivek Goyal +RH-Acked-by: Connor Kuehl + +lo_setattr() invokes fchmod() in a rarely used code path, so it should +be whitelisted or virtiofsd will crash with EBADSYS. 
+ +Said code path can be triggered for example as follows: + +On the host, in the shared directory, create a file with the sticky bit +set and a security.capability xattr: +(1) # touch foo +(2) # chmod u+s foo +(3) # setcap '' foo + +Then in the guest let some process truncate that file after it has +dropped all of its capabilities (at least CAP_FSETID): + +int main(int argc, char *argv[]) +{ + capng_setpid(getpid()); + capng_clear(CAPNG_SELECT_BOTH); + capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, 0); + capng_apply(CAPNG_SELECT_BOTH); + + ftruncate(open(argv[1], O_RDWR), 0); +} + +This will cause the guest kernel to drop the sticky bit (i.e. perform a +mode change) as part of the truncate (where FATTR_FH is set), and that +will cause virtiofsd to invoke fchmod() instead of fchmodat(). + +(A similar configuration exists further below with futimens() vs. +utimensat(), but the former is not a syscall but just a wrapper for the +latter, so no further whitelisting is required.) + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1842667 +Reported-by: Qian Cai +Cc: qemu-stable@nongnu.org +Signed-off-by: Max Reitz +Message-Id: <20200608093111.14942-1-mreitz@redhat.com> +Reviewed-by: Dr. David Alan Gilbert +Reviewed-by: Vivek Goyal +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 63659fe74e76f5c5285466f0c5cfbdca65b3688e) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. 
de Paula +--- + tools/virtiofsd/seccomp.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c +index bd9e7b083c..3b1522acdd 100644 +--- a/tools/virtiofsd/seccomp.c ++++ b/tools/virtiofsd/seccomp.c +@@ -42,6 +42,7 @@ static const int syscall_whitelist[] = { + SCMP_SYS(exit_group), + SCMP_SYS(fallocate), + SCMP_SYS(fchdir), ++ SCMP_SYS(fchmod), + SCMP_SYS(fchmodat), + SCMP_SYS(fchownat), + SCMP_SYS(fcntl), +-- +2.27.0 + diff --git a/kvm-virtiofsd-add-definition-of-fuse_buf_writev.patch b/kvm-virtiofsd-add-definition-of-fuse_buf_writev.patch new file mode 100755 index 0000000..a0882d5 --- /dev/null +++ b/kvm-virtiofsd-add-definition-of-fuse_buf_writev.patch @@ -0,0 +1,93 @@ +From e4c8fd1060fb69a093064851ebf66dd82533ec0e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:17 +0100 +Subject: [PATCH 106/116] virtiofsd: add definition of fuse_buf_writev() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-103-dgilbert@redhat.com> +Patchwork-id: 93557 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 102/112] virtiofsd: add definition of fuse_buf_writev() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: piaojun + +Define fuse_buf_writev() which use pwritev and writev to improve io +bandwidth. Especially, the src bufs with 0 size should be skipped as +their mems are not *block_size* aligned which will cause writev failed +in direct io mode. + +Signed-off-by: Jun Piao +Suggested-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 9ceaaa15cf21073c2b23058c374f61c30cd39c31) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 38 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 38 insertions(+) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 42a608f..37befeb 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -14,6 +14,7 @@ + #include "fuse_lowlevel.h" + #include + #include ++#include + #include + #include + +@@ -33,6 +34,43 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv) + return size; + } + ++__attribute__((unused)) ++static ssize_t fuse_buf_writev(struct fuse_buf *out_buf, ++ struct fuse_bufvec *in_buf) ++{ ++ ssize_t res, i, j; ++ size_t iovcnt = in_buf->count; ++ struct iovec *iov; ++ int fd = out_buf->fd; ++ ++ iov = calloc(iovcnt, sizeof(struct iovec)); ++ if (!iov) { ++ return -ENOMEM; ++ } ++ ++ for (i = 0, j = 0; i < iovcnt; i++) { ++ /* Skip the buf with 0 size */ ++ if (in_buf->buf[i].size) { ++ iov[j].iov_base = in_buf->buf[i].mem; ++ iov[j].iov_len = in_buf->buf[i].size; ++ j++; ++ } ++ } ++ ++ if (out_buf->flags & FUSE_BUF_FD_SEEK) { ++ res = pwritev(fd, iov, iovcnt, out_buf->pos); ++ } else { ++ res = writev(fd, iov, iovcnt); ++ } ++ ++ if (res == -1) { ++ res = -errno; ++ } ++ ++ free(iov); ++ return res; ++} ++ + static size_t min_size(size_t s1, size_t s2) + { + return s1 < s2 ? s1 : s2; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-fd-FDNUM-fd-passing-option.patch b/kvm-virtiofsd-add-fd-FDNUM-fd-passing-option.patch new file mode 100755 index 0000000..451f12b --- /dev/null +++ b/kvm-virtiofsd-add-fd-FDNUM-fd-passing-option.patch @@ -0,0 +1,170 @@ +From f91a9bdc171142174110e9ff1716b611f6fb0039 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:07 +0100 +Subject: [PATCH 036/116] virtiofsd: add --fd=FDNUM fd passing option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-33-dgilbert@redhat.com> +Patchwork-id: 93487 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 032/112] virtiofsd: add --fd=FDNUM fd passing option +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Although --socket-path=PATH is useful for manual invocations, management +tools typically create the UNIX domain socket themselves and pass it to +the vhost-user device backend. This way QEMU can be launched +immediately with a valid socket. No waiting for the vhost-user device +backend is required when fd passing is used. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit cee8e35d4386e34bf79c3ca2aab7f7b1bb48cf8d) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 1 + + tools/virtiofsd/fuse_lowlevel.c | 16 ++++++++++++---- + tools/virtiofsd/fuse_virtio.c | 31 +++++++++++++++++++++++++------ + 3 files changed, 38 insertions(+), 10 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 1126723..45995f3 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -68,6 +68,7 @@ struct fuse_session { + size_t bufsize; + int error; + char *vu_socket_path; ++ int vu_listen_fd; + int vu_socketfd; + struct fv_VuDev *virtio_dev; + }; +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 4f4684d..95f4db8 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2130,6 +2130,7 @@ static const struct fuse_opt fuse_ll_opts[] = { + LL_OPTION("--debug", debug, 1), + LL_OPTION("allow_root", deny_others, 
1), + LL_OPTION("--socket-path=%s", vu_socket_path, 0), ++ LL_OPTION("--fd=%d", vu_listen_fd, 0), + FUSE_OPT_END + }; + +@@ -2147,7 +2148,8 @@ void fuse_lowlevel_help(void) + */ + printf( + " -o allow_root allow access by root\n" +- " --socket-path=PATH path for the vhost-user socket\n"); ++ " --socket-path=PATH path for the vhost-user socket\n" ++ " --fd=FDNUM fd number of vhost-user socket\n"); + } + + void fuse_session_destroy(struct fuse_session *se) +@@ -2191,6 +2193,7 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + goto out1; + } + se->fd = -1; ++ se->vu_listen_fd = -1; + se->conn.max_write = UINT_MAX; + se->conn.max_readahead = UINT_MAX; + +@@ -2212,8 +2215,13 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + goto out4; + } + +- if (!se->vu_socket_path) { +- fprintf(stderr, "fuse: missing -o vhost_user_socket option\n"); ++ if (!se->vu_socket_path && se->vu_listen_fd < 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: missing --socket-path or --fd option\n"); ++ goto out4; ++ } ++ if (se->vu_socket_path && se->vu_listen_fd >= 0) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: --socket-path and --fd cannot be given together\n"); + goto out4; + } + +@@ -2253,7 +2261,7 @@ void fuse_session_unmount(struct fuse_session *se) + + int fuse_lowlevel_is_virtio(struct fuse_session *se) + { +- return se->vu_socket_path != NULL; ++ return !!se->virtio_dev; + } + + #ifdef linux +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 7e2711b..635f877 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -638,18 +638,21 @@ int virtio_loop(struct fuse_session *se) + return 0; + } + +-int virtio_session_mount(struct fuse_session *se) ++static int fv_create_listen_socket(struct fuse_session *se) + { + struct sockaddr_un un; + mode_t old_umask; + ++ /* Nothing to do if fd is already initialized */ ++ if (se->vu_listen_fd >= 0) { ++ return 0; ++ } ++ + if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) { 
+ fuse_log(FUSE_LOG_ERR, "Socket path too long\n"); + return -1; + } + +- se->fd = -1; +- + /* + * Create the Unix socket to communicate with qemu + * based on QEMU's vhost-user-bridge +@@ -682,15 +685,31 @@ int virtio_session_mount(struct fuse_session *se) + return -1; + } + ++ se->vu_listen_fd = listen_sock; ++ return 0; ++} ++ ++int virtio_session_mount(struct fuse_session *se) ++{ ++ int ret; ++ ++ ret = fv_create_listen_socket(se); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ se->fd = -1; ++ + fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n", + __func__); +- int data_sock = accept(listen_sock, NULL, NULL); ++ int data_sock = accept(se->vu_listen_fd, NULL, NULL); + if (data_sock == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n"); +- close(listen_sock); ++ close(se->vu_listen_fd); + return -1; + } +- close(listen_sock); ++ close(se->vu_listen_fd); ++ se->vu_listen_fd = -1; + fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n", + __func__); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-fuse_mbuf_iter-API.patch b/kvm-virtiofsd-add-fuse_mbuf_iter-API.patch new file mode 100755 index 0000000..b874dc9 --- /dev/null +++ b/kvm-virtiofsd-add-fuse_mbuf_iter-API.patch @@ -0,0 +1,134 @@ +From 1b0edd3d0a2ee5c097bcf3501c1dfa937f02e473 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:21 +0100 +Subject: [PATCH 050/116] virtiofsd: add fuse_mbuf_iter API +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-47-dgilbert@redhat.com> +Patchwork-id: 93502 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 046/112] virtiofsd: add fuse_mbuf_iter API +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Introduce an API for consuming bytes from a buffer with size checks. 
+All FUSE operations will be converted to use this safe API instead of +void *inarg. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit dad157e880416ab3a0e45beaa0e81977516568bc) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 28 +++++++++++++++++++++++++ + tools/virtiofsd/fuse_common.h | 49 ++++++++++++++++++++++++++++++++++++++++++- + 2 files changed, 76 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 772efa9..42a608f 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -267,3 +267,31 @@ ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv) + + return copied; + } ++ ++void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len) ++{ ++ void *ptr; ++ ++ if (len > iter->size - iter->pos) { ++ return NULL; ++ } ++ ++ ptr = iter->mem + iter->pos; ++ iter->pos += len; ++ return ptr; ++} ++ ++const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter) ++{ ++ const char *str = iter->mem + iter->pos; ++ size_t remaining = iter->size - iter->pos; ++ size_t i; ++ ++ for (i = 0; i < remaining; i++) { ++ if (str[i] == '\0') { ++ iter->pos += i + 1; ++ return str; ++ } ++ } ++ return NULL; ++} +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index 0cb33ac..f8f6433 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -703,10 +703,57 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv); + */ + ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src); + ++/** ++ * Memory buffer iterator ++ * ++ */ ++struct fuse_mbuf_iter { ++ /** ++ * Data pointer ++ */ ++ void *mem; ++ ++ /** ++ * Total length, in bytes ++ */ ++ size_t size; ++ ++ /** ++ * Offset from start of buffer ++ */ ++ size_t pos; ++}; ++ ++/* Initialize memory buffer iterator from a fuse_buf */ ++#define FUSE_MBUF_ITER_INIT(fbuf) \ ++ ((struct fuse_mbuf_iter){ \ 
++ .mem = fbuf->mem, \ ++ .size = fbuf->size, \ ++ .pos = 0, \ ++ }) ++ ++/** ++ * Consume bytes from a memory buffer iterator ++ * ++ * @param iter memory buffer iterator ++ * @param len number of bytes to consume ++ * @return pointer to start of consumed bytes or ++ * NULL if advancing beyond end of buffer ++ */ ++void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len); ++ ++/** ++ * Consume a NUL-terminated string from a memory buffer iterator ++ * ++ * @param iter memory buffer iterator ++ * @return pointer to the string or ++ * NULL if advancing beyond end of buffer or there is no NUL-terminator ++ */ ++const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter); ++ + /* + * Signal handling + */ +- + /** + * Exit session on HUP, TERM and INT signals and ignore PIPE signal + * +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-helper-for-lo_data-cleanup.patch b/kvm-virtiofsd-add-helper-for-lo_data-cleanup.patch new file mode 100755 index 0000000..bdef115 --- /dev/null +++ b/kvm-virtiofsd-add-helper-for-lo_data-cleanup.patch @@ -0,0 +1,88 @@ +From 7a3c94e10b087c06635ef72aadb1550184dd5c58 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:58 +0100 +Subject: [PATCH 087/116] virtiofsd: add helper for lo_data cleanup +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-84-dgilbert@redhat.com> +Patchwork-id: 93538 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 083/112] virtiofsd: add helper for lo_data cleanup +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +This offers an helper function for lo_data's cleanup. + +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 18a69cbbb6a4caa7c2040c6db4a33b044a32be7e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 37 +++++++++++++++++++++---------------- + 1 file changed, 21 insertions(+), 16 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 056ebe8..e8dc5c7 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2407,6 +2407,26 @@ static gboolean lo_key_equal(gconstpointer a, gconstpointer b) + return la->ino == lb->ino && la->dev == lb->dev; + } + ++static void fuse_lo_data_cleanup(struct lo_data *lo) ++{ ++ if (lo->inodes) { ++ g_hash_table_destroy(lo->inodes); ++ } ++ lo_map_destroy(&lo->fd_map); ++ lo_map_destroy(&lo->dirp_map); ++ lo_map_destroy(&lo->ino_map); ++ ++ if (lo->proc_self_fd >= 0) { ++ close(lo->proc_self_fd); ++ } ++ ++ if (lo->root.fd >= 0) { ++ close(lo->root.fd); ++ } ++ ++ free(lo->source); ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2554,22 +2574,7 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + +- if (lo.inodes) { +- g_hash_table_destroy(lo.inodes); +- } +- lo_map_destroy(&lo.fd_map); +- lo_map_destroy(&lo.dirp_map); +- lo_map_destroy(&lo.ino_map); +- +- if (lo.proc_self_fd >= 0) { +- close(lo.proc_self_fd); +- } +- +- if (lo.root.fd >= 0) { +- close(lo.root.fd); +- } +- +- free(lo.source); ++ fuse_lo_data_cleanup(&lo); + + return ret ? 1 : 0; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-o-source-PATH-to-help-output.patch b/kvm-virtiofsd-add-o-source-PATH-to-help-output.patch new file mode 100755 index 0000000..5e81663 --- /dev/null +++ b/kvm-virtiofsd-add-o-source-PATH-to-help-output.patch @@ -0,0 +1,46 @@ +From c55995c25f60168e3cb6b5bae1bf9a47813383d0 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:55 +0100 +Subject: [PATCH 024/116] virtiofsd: add -o source=PATH to help output +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-21-dgilbert@redhat.com> +Patchwork-id: 93474 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 020/112] virtiofsd: add -o source=PATH to help output +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +The -o source=PATH option will be used by most command-line invocations. +Let's document it! + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 4ff075f72be2f489c8998ae492ec5cdbbbd73e07) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 26ac870..fc9b264 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1319,6 +1319,7 @@ int main(int argc, char *argv[]) + if (opts.show_help) { + printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); ++ printf(" -o source=PATH shared directory tree\n"); + fuse_lowlevel_help(); + ret = 0; + goto err_out1; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-print-capabilities-option.patch b/kvm-virtiofsd-add-print-capabilities-option.patch new file mode 100755 index 0000000..b57e408 --- /dev/null +++ b/kvm-virtiofsd-add-print-capabilities-option.patch @@ -0,0 +1,121 @@ +From 23d81ee7564084f29e32fedaed5196ae1a5a3240 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:10 +0100 +Subject: [PATCH 039/116] virtiofsd: add --print-capabilities option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-36-dgilbert@redhat.com> +Patchwork-id: 93486 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 035/112] virtiofsd: add --print-capabilities option +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Add the --print-capabilities option as per vhost-user.rst "Backend +programs conventions". Currently there are no advertised features. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 45018fbb0a73ce66fd3dd87ecd2872b45658add4) +Signed-off-by: Miroslav Rezanina +--- + docs/interop/vhost-user.json | 4 +++- + tools/virtiofsd/fuse_lowlevel.h | 1 + + tools/virtiofsd/helper.c | 2 ++ + tools/virtiofsd/passthrough_ll.c | 12 ++++++++++++ + 4 files changed, 18 insertions(+), 1 deletion(-) + +diff --git a/docs/interop/vhost-user.json b/docs/interop/vhost-user.json +index da6aaf5..d4ea1f7 100644 +--- a/docs/interop/vhost-user.json ++++ b/docs/interop/vhost-user.json +@@ -31,6 +31,7 @@ + # @rproc-serial: virtio remoteproc serial link + # @scsi: virtio scsi + # @vsock: virtio vsock transport ++# @fs: virtio fs (since 4.2) + # + # Since: 4.0 + ## +@@ -50,7 +51,8 @@ + 'rpmsg', + 'rproc-serial', + 'scsi', +- 'vsock' ++ 'vsock', ++ 'fs' + ] + } + +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index f6b3470..0d61df8 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1794,6 +1794,7 @@ struct fuse_cmdline_opts { + int nodefault_subtype; + int show_version; + int show_help; ++ int print_capabilities; + unsigned int max_idle_threads; + }; + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index a3645fc..b8ec5ac 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -40,6 +40,7 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("--help", 
show_help), + FUSE_HELPER_OPT("-V", show_version), + FUSE_HELPER_OPT("--version", show_version), ++ FUSE_HELPER_OPT("--print-capabilities", print_capabilities), + FUSE_HELPER_OPT("-d", debug), + FUSE_HELPER_OPT("debug", debug), + FUSE_HELPER_OPT("-d", foreground), +@@ -135,6 +136,7 @@ void fuse_cmdline_help(void) + { + printf(" -h --help print help\n" + " -V --version print version\n" ++ " --print-capabilities print vhost-user.json\n" + " -d -o debug enable debug output (implies -f)\n" + " -f foreground operation\n" + " --daemonize run in background\n" +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 037c5d7..cd27c09 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1298,6 +1298,14 @@ static struct fuse_lowlevel_ops lo_oper = { + .lseek = lo_lseek, + }; + ++/* Print vhost-user.json backend program capabilities */ ++static void print_capabilities(void) ++{ ++ printf("{\n"); ++ printf(" \"type\": \"fs\"\n"); ++ printf("}\n"); ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -1328,6 +1336,10 @@ int main(int argc, char *argv[]) + fuse_lowlevel_version(); + ret = 0; + goto err_out1; ++ } else if (opts.print_capabilities) { ++ print_capabilities(); ++ ret = 0; ++ goto err_out1; + } + + if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-rlimit-nofile-NUM-option.patch b/kvm-virtiofsd-add-rlimit-nofile-NUM-option.patch new file mode 100755 index 0000000..a6a9cc9 --- /dev/null +++ b/kvm-virtiofsd-add-rlimit-nofile-NUM-option.patch @@ -0,0 +1,164 @@ +From 555ec3463b3dbfd6e08eac7840419d176f113e46 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:55 +0100 +Subject: [PATCH 4/9] virtiofsd: add --rlimit-nofile=NUM option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200505163600.22956-3-dgilbert@redhat.com> +Patchwork-id: 96270 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/7] virtiofsd: add --rlimit-nofile=NUM option +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. Tsirkin + +From: Stefan Hajnoczi + +Make it possible to specify the RLIMIT_NOFILE on the command-line. +Users running multiple virtiofsd processes should allocate a certain +number to each process so that the system-wide limit can never be +exhausted. + +When this option is set to 0 the rlimit is left at its current value. +This is useful when a management tool wants to configure the rlimit +itself. + +The default behavior remains unchanged: try to set the limit to +1,000,000 file descriptors if the current rlimit is lower. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Dr. David Alan Gilbert +Message-Id: <20200501140644.220940-2-stefanha@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 6dbb716877728ce4eb51619885ef6ef4ada9565f) +Signed-off-by: Danilo C. L. 
de Paula +--- + tools/virtiofsd/fuse_lowlevel.h | 1 + + tools/virtiofsd/helper.c | 23 +++++++++++++++++++++++ + tools/virtiofsd/passthrough_ll.c | 22 ++++++++-------------- + 3 files changed, 32 insertions(+), 14 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 8f6d705..562fd52 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1777,6 +1777,7 @@ struct fuse_cmdline_opts { + int syslog; + int log_level; + unsigned int max_idle_threads; ++ unsigned long rlimit_nofile; + }; + + /** +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 0801cf7..9b3eddc 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -23,6 +23,8 @@ + #include + #include + #include ++#include ++#include + #include + + #define FUSE_HELPER_OPT(t, p) \ +@@ -53,6 +55,7 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("subtype=", nodefault_subtype), + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), ++ FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile), + FUSE_HELPER_OPT("--syslog", syslog), + FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG), + FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO), +@@ -171,6 +174,9 @@ void fuse_cmdline_help(void) + " default: no_writeback\n" + " -o xattr|no_xattr enable/disable xattr\n" + " default: no_xattr\n" ++ " --rlimit-nofile= set maximum number of file descriptors\n" ++ " (0 leaves rlimit unchanged)\n" ++ " default: 1,000,000 if the current rlimit is lower\n" + ); + } + +@@ -191,11 +197,28 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key, + } + } + ++static unsigned long get_default_rlimit_nofile(void) ++{ ++ rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */ ++ struct rlimit rlim; ++ ++ if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { ++ fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): 
%m\n"); ++ exit(1); ++ } ++ ++ if (rlim.rlim_cur >= max_fds) { ++ return 0; /* we have more fds available than required! */ ++ } ++ return max_fds; ++} ++ + int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) + { + memset(opts, 0, sizeof(struct fuse_cmdline_opts)); + + opts->max_idle_threads = 10; ++ opts->rlimit_nofile = get_default_rlimit_nofile(); + opts->foreground = 1; + + if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 50ff672..184ad0f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2711,24 +2711,18 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, + setup_seccomp(enable_syslog); + } + +-/* Raise the maximum number of open file descriptors */ +-static void setup_nofile_rlimit(void) ++/* Set the maximum number of open file descriptors */ ++static void setup_nofile_rlimit(unsigned long rlimit_nofile) + { +- const rlim_t max_fds = 1000000; +- struct rlimit rlim; +- +- if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { +- fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); +- exit(1); +- } ++ struct rlimit rlim = { ++ .rlim_cur = rlimit_nofile, ++ .rlim_max = rlimit_nofile, ++ }; + +- if (rlim.rlim_cur >= max_fds) { ++ if (rlimit_nofile == 0) { + return; /* nothing to do */ + } + +- rlim.rlim_cur = max_fds; +- rlim.rlim_max = max_fds; +- + if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { + /* Ignore SELinux denials */ + if (errno == EPERM) { +@@ -2981,7 +2975,7 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + +- setup_nofile_rlimit(); ++ setup_nofile_rlimit(opts.rlimit_nofile); + + /* Must be before sandbox since it wants /proc */ + setup_capng(); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-seccomp-whitelist.patch b/kvm-virtiofsd-add-seccomp-whitelist.patch new file mode 100755 index 0000000..b34108e --- /dev/null +++ 
b/kvm-virtiofsd-add-seccomp-whitelist.patch @@ -0,0 +1,285 @@ +From 58c4e9473b364fb62aac797b0d69fd8ddb02c8c7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:30 +0100 +Subject: [PATCH 059/116] virtiofsd: add seccomp whitelist +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-56-dgilbert@redhat.com> +Patchwork-id: 93511 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 055/112] virtiofsd: add seccomp whitelist +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Only allow system calls that are needed by virtiofsd. All other system +calls cause SIGSYS to be directed at the thread and the process will +coredump. + +Restricting system calls reduces the kernel attack surface and limits +what the process can do when compromised. + +Signed-off-by: Stefan Hajnoczi +with additional entries by: +Signed-off-by: Ganesh Maharaj Mahalingam +Signed-off-by: Masayoshi Mizuma +Signed-off-by: Misono Tomohiro +Signed-off-by: piaojun +Signed-off-by: Vivek Goyal +Signed-off-by: Eric Ren +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 4f8bde99c175ffd86b5125098a4707d43f5e80c6) + +Signed-off-by: Miroslav Rezanina +--- + Makefile | 5 +- + tools/virtiofsd/Makefile.objs | 5 +- + tools/virtiofsd/passthrough_ll.c | 2 + + tools/virtiofsd/seccomp.c | 151 +++++++++++++++++++++++++++++++++++++++ + tools/virtiofsd/seccomp.h | 14 ++++ + 5 files changed, 174 insertions(+), 3 deletions(-) + create mode 100644 tools/virtiofsd/seccomp.c + create mode 100644 tools/virtiofsd/seccomp.h + +diff --git a/Makefile b/Makefile +index 0e9755d..6879a06 100644 +--- a/Makefile ++++ b/Makefile +@@ -330,7 +330,7 @@ endif + endif + endif + +-ifdef CONFIG_LINUX ++ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP),yy) + HELPERS-y += virtiofsd$(EXESUF) + vhost-user-json-y += tools/virtiofsd/50-qemu-virtiofsd.json + endif +@@ -681,7 +681,8 @@ rdmacm-mux$(EXESUF): LIBS += "-libumad" + rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS) + $(call LINK, $^) + +-ifdef CONFIG_LINUX # relies on Linux-specific syscalls ++# relies on Linux-specific syscalls ++ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP),yy) + virtiofsd$(EXESUF): $(virtiofsd-obj-y) libvhost-user.a $(COMMON_LDADDS) + $(call LINK, $^) + endif +diff --git a/tools/virtiofsd/Makefile.objs b/tools/virtiofsd/Makefile.objs +index 45a8075..076f667 100644 +--- a/tools/virtiofsd/Makefile.objs ++++ b/tools/virtiofsd/Makefile.objs +@@ -5,5 +5,8 @@ virtiofsd-obj-y = buffer.o \ + fuse_signals.o \ + fuse_virtio.o \ + helper.o \ +- passthrough_ll.o ++ passthrough_ll.o \ ++ seccomp.o + ++seccomp.o-cflags := $(SECCOMP_CFLAGS) ++seccomp.o-libs := $(SECCOMP_LIBS) +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 0947d14..bd8925b 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -59,6 +59,7 @@ + #include + + #include "passthrough_helpers.h" ++#include "seccomp.h" + + struct lo_map_elem { + union { +@@ -2091,6 +2092,7 @@ static void setup_sandbox(struct lo_data *lo, struct 
fuse_session *se) + { + setup_namespaces(lo, se); + setup_mounts(lo->source); ++ setup_seccomp(); + } + + int main(int argc, char *argv[]) +diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c +new file mode 100644 +index 0000000..691fb63 +--- /dev/null ++++ b/tools/virtiofsd/seccomp.c +@@ -0,0 +1,151 @@ ++/* ++ * Seccomp sandboxing for virtiofsd ++ * ++ * Copyright (C) 2019 Red Hat, Inc. ++ * ++ * SPDX-License-Identifier: GPL-2.0-or-later ++ */ ++ ++#include "qemu/osdep.h" ++#include "seccomp.h" ++#include "fuse_i.h" ++#include "fuse_log.h" ++#include ++#include ++#include ++#include ++ ++/* Bodge for libseccomp 2.4.2 which broke ppoll */ ++#if !defined(__SNR_ppoll) && defined(__SNR_brk) ++#ifdef __NR_ppoll ++#define __SNR_ppoll __NR_ppoll ++#else ++#define __SNR_ppoll __PNR_ppoll ++#endif ++#endif ++ ++static const int syscall_whitelist[] = { ++ /* TODO ireg sem*() syscalls */ ++ SCMP_SYS(brk), ++ SCMP_SYS(capget), /* For CAP_FSETID */ ++ SCMP_SYS(capset), ++ SCMP_SYS(clock_gettime), ++ SCMP_SYS(clone), ++#ifdef __NR_clone3 ++ SCMP_SYS(clone3), ++#endif ++ SCMP_SYS(close), ++ SCMP_SYS(copy_file_range), ++ SCMP_SYS(dup), ++ SCMP_SYS(eventfd2), ++ SCMP_SYS(exit), ++ SCMP_SYS(exit_group), ++ SCMP_SYS(fallocate), ++ SCMP_SYS(fchmodat), ++ SCMP_SYS(fchownat), ++ SCMP_SYS(fcntl), ++ SCMP_SYS(fdatasync), ++ SCMP_SYS(fgetxattr), ++ SCMP_SYS(flistxattr), ++ SCMP_SYS(flock), ++ SCMP_SYS(fremovexattr), ++ SCMP_SYS(fsetxattr), ++ SCMP_SYS(fstat), ++ SCMP_SYS(fstatfs), ++ SCMP_SYS(fsync), ++ SCMP_SYS(ftruncate), ++ SCMP_SYS(futex), ++ SCMP_SYS(getdents), ++ SCMP_SYS(getdents64), ++ SCMP_SYS(getegid), ++ SCMP_SYS(geteuid), ++ SCMP_SYS(getpid), ++ SCMP_SYS(gettid), ++ SCMP_SYS(gettimeofday), ++ SCMP_SYS(linkat), ++ SCMP_SYS(lseek), ++ SCMP_SYS(madvise), ++ SCMP_SYS(mkdirat), ++ SCMP_SYS(mknodat), ++ SCMP_SYS(mmap), ++ SCMP_SYS(mprotect), ++ SCMP_SYS(mremap), ++ SCMP_SYS(munmap), ++ SCMP_SYS(newfstatat), ++ SCMP_SYS(open), ++ SCMP_SYS(openat), ++ SCMP_SYS(ppoll), 
++ SCMP_SYS(prctl), /* TODO restrict to just PR_SET_NAME? */ ++ SCMP_SYS(preadv), ++ SCMP_SYS(pread64), ++ SCMP_SYS(pwritev), ++ SCMP_SYS(pwrite64), ++ SCMP_SYS(read), ++ SCMP_SYS(readlinkat), ++ SCMP_SYS(recvmsg), ++ SCMP_SYS(renameat), ++ SCMP_SYS(renameat2), ++ SCMP_SYS(rt_sigaction), ++ SCMP_SYS(rt_sigprocmask), ++ SCMP_SYS(rt_sigreturn), ++ SCMP_SYS(sendmsg), ++ SCMP_SYS(setresgid), ++ SCMP_SYS(setresuid), ++#ifdef __NR_setresgid32 ++ SCMP_SYS(setresgid32), ++#endif ++#ifdef __NR_setresuid32 ++ SCMP_SYS(setresuid32), ++#endif ++ SCMP_SYS(set_robust_list), ++ SCMP_SYS(symlinkat), ++ SCMP_SYS(time), /* Rarely needed, except on static builds */ ++ SCMP_SYS(tgkill), ++ SCMP_SYS(unlinkat), ++ SCMP_SYS(utimensat), ++ SCMP_SYS(write), ++ SCMP_SYS(writev), ++}; ++ ++void setup_seccomp(void) ++{ ++ scmp_filter_ctx ctx; ++ size_t i; ++ ++#ifdef SCMP_ACT_KILL_PROCESS ++ ctx = seccomp_init(SCMP_ACT_KILL_PROCESS); ++ /* Handle a newer libseccomp but an older kernel */ ++ if (!ctx && errno == EOPNOTSUPP) { ++ ctx = seccomp_init(SCMP_ACT_TRAP); ++ } ++#else ++ ctx = seccomp_init(SCMP_ACT_TRAP); ++#endif ++ if (!ctx) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_init() failed\n"); ++ exit(1); ++ } ++ ++ for (i = 0; i < G_N_ELEMENTS(syscall_whitelist); i++) { ++ if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, ++ syscall_whitelist[i], 0) != 0) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d", ++ syscall_whitelist[i]); ++ exit(1); ++ } ++ } ++ ++ /* libvhost-user calls this for post-copy migration, we don't need it */ ++ if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOSYS), ++ SCMP_SYS(userfaultfd), 0) != 0) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_rule_add userfaultfd failed\n"); ++ exit(1); ++ } ++ ++ if (seccomp_load(ctx) < 0) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_load() failed\n"); ++ exit(1); ++ } ++ ++ seccomp_release(ctx); ++} +diff --git a/tools/virtiofsd/seccomp.h b/tools/virtiofsd/seccomp.h +new file mode 100644 +index 0000000..86bce72 +--- /dev/null ++++ b/tools/virtiofsd/seccomp.h 
+@@ -0,0 +1,14 @@ ++/* ++ * Seccomp sandboxing for virtiofsd ++ * ++ * Copyright (C) 2019 Red Hat, Inc. ++ * ++ * SPDX-License-Identifier: GPL-2.0-or-later ++ */ ++ ++#ifndef VIRTIOFSD_SECCOMP_H ++#define VIRTIOFSD_SECCOMP_H ++ ++void setup_seccomp(void); ++ ++#endif /* VIRTIOFSD_SECCOMP_H */ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-some-options-to-the-help-message.patch b/kvm-virtiofsd-add-some-options-to-the-help-message.patch new file mode 100755 index 0000000..ac6dc54 --- /dev/null +++ b/kvm-virtiofsd-add-some-options-to-the-help-message.patch @@ -0,0 +1,74 @@ +From 6d62abb99b6b918f05f099b01a99f4326a69d650 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:26 +0100 +Subject: [PATCH 115/116] virtiofsd: add some options to the help message +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-112-dgilbert@redhat.com> +Patchwork-id: 93565 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 111/112] virtiofsd: add some options to the help message +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Masayoshi Mizuma + +Add following options to the help message: +- cache +- flock|no_flock +- norace +- posix_lock|no_posix_lock +- readdirplus|no_readdirplus +- timeout +- writeback|no_writeback +- xattr|no_xattr + +Signed-off-by: Masayoshi Mizuma + +dgilbert: Split cache, norace, posix_lock, readdirplus off + into our own earlier patches that added the options + +Reviewed-by: Dr. David Alan Gilbert +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 1d59b1b210d7c3b0bdf4b10ebe0bb1fccfcb8b95) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index f98d8f2..0801cf7 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -148,6 +148,8 @@ void fuse_cmdline_help(void) + " -o cache= cache mode. could be one of \"auto, " + "always, none\"\n" + " default: auto\n" ++ " -o flock|no_flock enable/disable flock\n" ++ " default: no_flock\n" + " -o log_level= log level, default to \"info\"\n" + " level could be one of \"debug, " + "info, warn, err\"\n" +@@ -163,7 +165,13 @@ void fuse_cmdline_help(void) + " enable/disable readirplus\n" + " default: readdirplus except with " + "cache=none\n" +- ); ++ " -o timeout= I/O timeout (second)\n" ++ " default: depends on cache= option.\n" ++ " -o writeback|no_writeback enable/disable writeback cache\n" ++ " default: no_writeback\n" ++ " -o xattr|no_xattr enable/disable xattr\n" ++ " default: no_xattr\n" ++ ); + } + + static int fuse_helper_opt_proc(void *data, const char *arg, int key, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-syslog-command-line-option.patch b/kvm-virtiofsd-add-syslog-command-line-option.patch new file mode 100755 index 0000000..5b55342 --- /dev/null +++ b/kvm-virtiofsd-add-syslog-command-line-option.patch @@ -0,0 +1,239 @@ +From 6f5cf644bebc189bdb16f1caf3d7c47835d7c287 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:36 +0100 +Subject: [PATCH 065/116] virtiofsd: add --syslog command-line option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-62-dgilbert@redhat.com> +Patchwork-id: 93509 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 061/112] virtiofsd: add --syslog command-line option +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Sometimes collecting output from stderr is inconvenient or does not fit +within the overall logging architecture. Add syslog(3) support for +cases where stderr cannot be used. + +Signed-off-by: Stefan Hajnoczi +dgilbert: Reworked as a logging function +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit f185621d41f03a23b55795b89e6584253fa23505) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.h | 1 + + tools/virtiofsd/helper.c | 2 ++ + tools/virtiofsd/passthrough_ll.c | 50 +++++++++++++++++++++++++++++++++++++--- + tools/virtiofsd/seccomp.c | 32 +++++++++++++++++-------- + tools/virtiofsd/seccomp.h | 4 +++- + 5 files changed, 76 insertions(+), 13 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 0d61df8..f2750bc 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1795,6 +1795,7 @@ struct fuse_cmdline_opts { + int show_version; + int show_help; + int print_capabilities; ++ int syslog; + unsigned int max_idle_threads; + }; + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5531425..9692ef9 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -54,6 +54,7 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("subtype=", nodefault_subtype), + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), ++ FUSE_HELPER_OPT("--syslog", syslog), + FUSE_OPT_END + }; + +@@ -138,6 +139,7 @@ void fuse_cmdline_help(void) + " -V --version print 
version\n" + " --print-capabilities print vhost-user.json\n" + " -d -o debug enable debug output (implies -f)\n" ++ " --syslog log to syslog (default stderr)\n" + " -f foreground operation\n" + " --daemonize run in background\n" + " -o max_idle_threads the maximum number of idle worker " +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index c281d81..0372aca 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -58,6 +58,7 @@ + #include + #include + #include ++#include + #include + + #include "passthrough_helpers.h" +@@ -138,6 +139,7 @@ static const struct fuse_opt lo_opts[] = { + { "norace", offsetof(struct lo_data, norace), 1 }, + FUSE_OPT_END + }; ++static bool use_syslog = false; + + static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); + +@@ -2262,11 +2264,12 @@ static void setup_mounts(const char *source) + * Lock down this process to prevent access to other processes or files outside + * source directory. This reduces the impact of arbitrary code execution bugs. 
+ */ +-static void setup_sandbox(struct lo_data *lo, struct fuse_session *se) ++static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, ++ bool enable_syslog) + { + setup_namespaces(lo, se); + setup_mounts(lo->source); +- setup_seccomp(); ++ setup_seccomp(enable_syslog); + } + + /* Raise the maximum number of open file descriptors */ +@@ -2298,6 +2301,42 @@ static void setup_nofile_rlimit(void) + } + } + ++static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) ++{ ++ if (use_syslog) { ++ int priority = LOG_ERR; ++ switch (level) { ++ case FUSE_LOG_EMERG: ++ priority = LOG_EMERG; ++ break; ++ case FUSE_LOG_ALERT: ++ priority = LOG_ALERT; ++ break; ++ case FUSE_LOG_CRIT: ++ priority = LOG_CRIT; ++ break; ++ case FUSE_LOG_ERR: ++ priority = LOG_ERR; ++ break; ++ case FUSE_LOG_WARNING: ++ priority = LOG_WARNING; ++ break; ++ case FUSE_LOG_NOTICE: ++ priority = LOG_NOTICE; ++ break; ++ case FUSE_LOG_INFO: ++ priority = LOG_INFO; ++ break; ++ case FUSE_LOG_DEBUG: ++ priority = LOG_DEBUG; ++ break; ++ } ++ vsyslog(priority, fmt, ap); ++ } else { ++ vfprintf(stderr, fmt, ap); ++ } ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2336,6 +2375,11 @@ int main(int argc, char *argv[]) + if (fuse_parse_cmdline(&args, &opts) != 0) { + return 1; + } ++ fuse_set_log_func(log_func); ++ use_syslog = opts.syslog; ++ if (use_syslog) { ++ openlog("virtiofsd", LOG_PID, LOG_DAEMON); ++ } + if (opts.show_help) { + printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); +@@ -2424,7 +2468,7 @@ int main(int argc, char *argv[]) + /* Must be before sandbox since it wants /proc */ + setup_capng(); + +- setup_sandbox(&lo, se); ++ setup_sandbox(&lo, se, opts.syslog); + + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); +diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c +index 691fb63..2d9d4a7 100644 +--- a/tools/virtiofsd/seccomp.c ++++ 
b/tools/virtiofsd/seccomp.c +@@ -107,11 +107,28 @@ static const int syscall_whitelist[] = { + SCMP_SYS(writev), + }; + +-void setup_seccomp(void) ++/* Syscalls used when --syslog is enabled */ ++static const int syscall_whitelist_syslog[] = { ++ SCMP_SYS(sendto), ++}; ++ ++static void add_whitelist(scmp_filter_ctx ctx, const int syscalls[], size_t len) + { +- scmp_filter_ctx ctx; + size_t i; + ++ for (i = 0; i < len; i++) { ++ if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscalls[i], 0) != 0) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d failed\n", ++ syscalls[i]); ++ exit(1); ++ } ++ } ++} ++ ++void setup_seccomp(bool enable_syslog) ++{ ++ scmp_filter_ctx ctx; ++ + #ifdef SCMP_ACT_KILL_PROCESS + ctx = seccomp_init(SCMP_ACT_KILL_PROCESS); + /* Handle a newer libseccomp but an older kernel */ +@@ -126,13 +143,10 @@ void setup_seccomp(void) + exit(1); + } + +- for (i = 0; i < G_N_ELEMENTS(syscall_whitelist); i++) { +- if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, +- syscall_whitelist[i], 0) != 0) { +- fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d", +- syscall_whitelist[i]); +- exit(1); +- } ++ add_whitelist(ctx, syscall_whitelist, G_N_ELEMENTS(syscall_whitelist)); ++ if (enable_syslog) { ++ add_whitelist(ctx, syscall_whitelist_syslog, ++ G_N_ELEMENTS(syscall_whitelist_syslog)); + } + + /* libvhost-user calls this for post-copy migration, we don't need it */ +diff --git a/tools/virtiofsd/seccomp.h b/tools/virtiofsd/seccomp.h +index 86bce72..d47c8ea 100644 +--- a/tools/virtiofsd/seccomp.h ++++ b/tools/virtiofsd/seccomp.h +@@ -9,6 +9,8 @@ + #ifndef VIRTIOFSD_SECCOMP_H + #define VIRTIOFSD_SECCOMP_H + +-void setup_seccomp(void); ++#include ++ ++void setup_seccomp(bool enable_syslog); + + #endif /* VIRTIOFSD_SECCOMP_H */ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-thread-pool-size-NUM-option.patch b/kvm-virtiofsd-add-thread-pool-size-NUM-option.patch new file mode 100755 index 0000000..0241a9d --- /dev/null +++ 
b/kvm-virtiofsd-add-thread-pool-size-NUM-option.patch @@ -0,0 +1,106 @@ +From 3dbfb932288eb5a55dfdc0eebca7e4c7f0cf6f33 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:22 +0100 +Subject: [PATCH 111/116] virtiofsd: add --thread-pool-size=NUM option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-108-dgilbert@redhat.com> +Patchwork-id: 93561 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 107/112] virtiofsd: add --thread-pool-size=NUM option +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Add an option to control the size of the thread pool. Requests are now +processed in parallel by default. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 951b3120dbc971f08681e1d860360e4a1e638902) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 1 + + tools/virtiofsd/fuse_lowlevel.c | 7 ++++++- + tools/virtiofsd/fuse_virtio.c | 5 +++-- + 3 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 1447d86..4e47e58 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -72,6 +72,7 @@ struct fuse_session { + int vu_listen_fd; + int vu_socketfd; + struct fv_VuDev *virtio_dev; ++ int thread_pool_size; + }; + + struct fuse_chan { +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 79a4031..de2e2e0 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -28,6 +28,7 @@ + #include + #include + ++#define THREAD_POOL_SIZE 64 + + #define OFFSET_MAX 0x7fffffffffffffffLL + +@@ -2519,6 +2520,7 @@ static const struct fuse_opt fuse_ll_opts[] = { + 
LL_OPTION("allow_root", deny_others, 1), + LL_OPTION("--socket-path=%s", vu_socket_path, 0), + LL_OPTION("--fd=%d", vu_listen_fd, 0), ++ LL_OPTION("--thread-pool-size=%d", thread_pool_size, 0), + FUSE_OPT_END + }; + +@@ -2537,7 +2539,9 @@ void fuse_lowlevel_help(void) + printf( + " -o allow_root allow access by root\n" + " --socket-path=PATH path for the vhost-user socket\n" +- " --fd=FDNUM fd number of vhost-user socket\n"); ++ " --fd=FDNUM fd number of vhost-user socket\n" ++ " --thread-pool-size=NUM thread pool size limit (default %d)\n", ++ THREAD_POOL_SIZE); + } + + void fuse_session_destroy(struct fuse_session *se) +@@ -2591,6 +2595,7 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + } + se->fd = -1; + se->vu_listen_fd = -1; ++ se->thread_pool_size = THREAD_POOL_SIZE; + se->conn.max_write = UINT_MAX; + se->conn.max_readahead = UINT_MAX; + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 0dcf2ef..9f65823 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -572,10 +572,11 @@ static void *fv_queue_thread(void *opaque) + struct fv_QueueInfo *qi = opaque; + struct VuDev *dev = &qi->virtio_dev->dev; + struct VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ struct fuse_session *se = qi->virtio_dev->se; + GThreadPool *pool; + +- pool = g_thread_pool_new(fv_queue_worker, qi, 1 /* TODO max_threads */, +- TRUE, NULL); ++ pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size, TRUE, ++ NULL); + if (!pool) { + fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__); + return NULL; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-vhost-user.json-file.patch b/kvm-virtiofsd-add-vhost-user.json-file.patch new file mode 100755 index 0000000..a24b24f --- /dev/null +++ b/kvm-virtiofsd-add-vhost-user.json-file.patch @@ -0,0 +1,73 @@ +From 77eb3258e76a1ac240503572d4f41d45cb832ba2 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:09 +0100 +Subject: [PATCH 038/116] virtiofsd: add vhost-user.json file +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-35-dgilbert@redhat.com> +Patchwork-id: 93490 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 034/112] virtiofsd: add vhost-user.json file +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Install a vhost-user.json file describing virtiofsd. This allows +libvirt and other management tools to enumerate vhost-user backend +programs. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 315616ed50ba15a5d7236ade8a402a93898202de) +Signed-off-by: Miroslav Rezanina +--- + .gitignore | 1 + + Makefile | 1 + + tools/virtiofsd/50-qemu-virtiofsd.json.in | 5 +++++ + 3 files changed, 7 insertions(+) + create mode 100644 tools/virtiofsd/50-qemu-virtiofsd.json.in + +diff --git a/.gitignore b/.gitignore +index aefad32..d7a4f99 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -6,6 +6,7 @@ + /config-target.* + /config.status + /config-temp ++/tools/virtiofsd/50-qemu-virtiofsd.json + /elf2dmp + /trace-events-all + /trace/generated-events.h +diff --git a/Makefile b/Makefile +index 1526775..0e9755d 100644 +--- a/Makefile ++++ b/Makefile +@@ -332,6 +332,7 @@ endif + + ifdef CONFIG_LINUX + HELPERS-y += virtiofsd$(EXESUF) ++vhost-user-json-y += tools/virtiofsd/50-qemu-virtiofsd.json + endif + + # Sphinx does not allow building manuals into the same directory as +diff --git a/tools/virtiofsd/50-qemu-virtiofsd.json.in b/tools/virtiofsd/50-qemu-virtiofsd.json.in +new file mode 100644 +index 0000000..9bcd86f +--- /dev/null ++++ b/tools/virtiofsd/50-qemu-virtiofsd.json.in +@@ -0,0 +1,5 @@ ++{ ++ "description": "QEMU virtiofsd vhost-user-fs", ++ 
"type": "fs", ++ "binary": "@libexecdir@/virtiofsd" ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-cap-ng-helpers.patch b/kvm-virtiofsd-cap-ng-helpers.patch new file mode 100755 index 0000000..305745d --- /dev/null +++ b/kvm-virtiofsd-cap-ng-helpers.patch @@ -0,0 +1,175 @@ +From f62613d8058bcb60b26727d980a37537103b0033 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:32 +0100 +Subject: [PATCH 061/116] virtiofsd: cap-ng helpers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-58-dgilbert@redhat.com> +Patchwork-id: 93512 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 057/112] virtiofsd: cap-ng helpers +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +libcap-ng reads /proc during capng_get_caps_process, and virtiofsd's +sandboxing doesn't have /proc mounted; thus we have to do the +caps read before we sandbox it and save/restore the state. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 2405f3c0d19eb4d516a88aa4e5c54e5f9c6bbea3) +Signed-off-by: Miroslav Rezanina +--- + Makefile | 4 +-- + tools/virtiofsd/passthrough_ll.c | 72 ++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 74 insertions(+), 2 deletions(-) + +diff --git a/Makefile b/Makefile +index 6879a06..ff05c30 100644 +--- a/Makefile ++++ b/Makefile +@@ -330,7 +330,7 @@ endif + endif + endif + +-ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP),yy) ++ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy) + HELPERS-y += virtiofsd$(EXESUF) + vhost-user-json-y += tools/virtiofsd/50-qemu-virtiofsd.json + endif +@@ -682,7 +682,7 @@ rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS) + $(call LINK, $^) + + # relies on Linux-specific syscalls +-ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP),yy) ++ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy) + virtiofsd$(EXESUF): $(virtiofsd-obj-y) libvhost-user.a $(COMMON_LDADDS) + $(call LINK, $^) + endif +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index bd8925b..97e7c75 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -39,6 +39,7 @@ + #include "fuse_virtio.h" + #include "fuse_lowlevel.h" + #include ++#include + #include + #include + #include +@@ -139,6 +140,13 @@ static const struct fuse_opt lo_opts[] = { + + static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); + ++static struct { ++ pthread_mutex_t mutex; ++ void *saved; ++} cap; ++/* That we loaded cap-ng in the current thread from the saved */ ++static __thread bool cap_loaded = 0; ++ + static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st); + + static int is_dot_or_dotdot(const char *name) +@@ -162,6 +170,37 @@ static struct lo_data *lo_data(fuse_req_t req) + return (struct lo_data *)fuse_req_userdata(req); + } + ++/* ++ * Load capng's state from our saved state if the current thread ++ * hadn't previously been loaded. 
++ * returns 0 on success ++ */ ++static int load_capng(void) ++{ ++ if (!cap_loaded) { ++ pthread_mutex_lock(&cap.mutex); ++ capng_restore_state(&cap.saved); ++ /* ++ * restore_state free's the saved copy ++ * so make another. ++ */ ++ cap.saved = capng_save_state(); ++ if (!cap.saved) { ++ fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n"); ++ return -EINVAL; ++ } ++ pthread_mutex_unlock(&cap.mutex); ++ ++ /* ++ * We want to use the loaded state for our pid, ++ * not the original ++ */ ++ capng_setpid(syscall(SYS_gettid)); ++ cap_loaded = true; ++ } ++ return 0; ++} ++ + static void lo_map_init(struct lo_map *map) + { + map->elems = NULL; +@@ -2024,6 +2063,35 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + } + + /* ++ * Capture the capability state, we'll need to restore this for individual ++ * threads later; see load_capng. ++ */ ++static void setup_capng(void) ++{ ++ /* Note this accesses /proc so has to happen before the sandbox */ ++ if (capng_get_caps_process()) { ++ fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n"); ++ exit(1); ++ } ++ pthread_mutex_init(&cap.mutex, NULL); ++ pthread_mutex_lock(&cap.mutex); ++ cap.saved = capng_save_state(); ++ if (!cap.saved) { ++ fuse_log(FUSE_LOG_ERR, "capng_save_state\n"); ++ exit(1); ++ } ++ pthread_mutex_unlock(&cap.mutex); ++} ++ ++static void cleanup_capng(void) ++{ ++ free(cap.saved); ++ cap.saved = NULL; ++ pthread_mutex_destroy(&cap.mutex); ++} ++ ++ ++/* + * Make the source directory our root so symlinks cannot escape and no other + * files are accessible. Assumes unshare(CLONE_NEWNS) was already called. 
+ */ +@@ -2216,12 +2284,16 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + ++ /* Must be before sandbox since it wants /proc */ ++ setup_capng(); ++ + setup_sandbox(&lo, se); + + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + + fuse_session_unmount(se); ++ cleanup_capng(); + err_out3: + fuse_remove_signal_handlers(se); + err_out2: +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-check-input-buffer-size-in-fuse_lowlevel.c.patch b/kvm-virtiofsd-check-input-buffer-size-in-fuse_lowlevel.c.patch new file mode 100755 index 0000000..caa4560 --- /dev/null +++ b/kvm-virtiofsd-check-input-buffer-size-in-fuse_lowlevel.c.patch @@ -0,0 +1,1111 @@ +From d6a0067e6c08523a8f605f775be980eaf0a23690 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:23 +0100 +Subject: [PATCH 052/116] virtiofsd: check input buffer size in fuse_lowlevel.c + ops +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-49-dgilbert@redhat.com> +Patchwork-id: 93503 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 048/112] virtiofsd: check input buffer size in fuse_lowlevel.c ops +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Each FUSE operation involves parsing the input buffer. Currently the +code assumes the input buffer is large enough for the expected +arguments. This patch uses fuse_mbuf_iter to check the size. + +Most operations are simple to convert. Some are more complicated due to +variable-length inputs or different sizes depending on the protocol +version. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 70995754416eb4491c31607fe380a83cfd25a087) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 581 +++++++++++++++++++++++++++++++--------- + 1 file changed, 456 insertions(+), 125 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 611e8b0..02e1d83 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -27,7 +28,6 @@ + #include + + +-#define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) + #define OFFSET_MAX 0x7fffffffffffffffLL + + struct fuse_pollhandle { +@@ -706,9 +706,14 @@ int fuse_reply_lseek(fuse_req_t req, off_t off) + return send_reply_ok(req, &arg, sizeof(arg)); + } + +-static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ if (!name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.lookup) { + req->se->op.lookup(req, nodeid, name); +@@ -717,9 +722,16 @@ static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_forget(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_forget(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_forget_in *arg = (struct fuse_forget_in *)inarg; ++ struct fuse_forget_in *arg; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.forget) { + req->se->op.forget(req, nodeid, arg->nlookup); +@@ -729,20 +741,48 @@ static void do_forget(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + + static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, +- const void 
*inarg) ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_batch_forget_in *arg = (void *)inarg; +- struct fuse_forget_one *param = (void *)PARAM(arg); +- unsigned int i; ++ struct fuse_batch_forget_in *arg; ++ struct fuse_forget_data *forgets; ++ size_t scount; + + (void)nodeid; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_none(req); ++ return; ++ } ++ ++ /* ++ * Prevent integer overflow. The compiler emits the following warning ++ * unless we use the scount local variable: ++ * ++ * error: comparison is always false due to limited range of data type ++ * [-Werror=type-limits] ++ * ++ * This may be true on 64-bit hosts but we need this check for 32-bit ++ * hosts. ++ */ ++ scount = arg->count; ++ if (scount > SIZE_MAX / sizeof(forgets[0])) { ++ fuse_reply_none(req); ++ return; ++ } ++ ++ forgets = fuse_mbuf_iter_advance(iter, arg->count * sizeof(forgets[0])); ++ if (!forgets) { ++ fuse_reply_none(req); ++ return; ++ } ++ + if (req->se->op.forget_multi) { +- req->se->op.forget_multi(req, arg->count, +- (struct fuse_forget_data *)param); ++ req->se->op.forget_multi(req, arg->count, forgets); + } else if (req->se->op.forget) { ++ unsigned int i; ++ + for (i = 0; i < arg->count; i++) { +- struct fuse_forget_one *forget = ¶m[i]; + struct fuse_req *dummy_req; + + dummy_req = fuse_ll_alloc_req(req->se); +@@ -754,7 +794,7 @@ static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, + dummy_req->ctx = req->ctx; + dummy_req->ch = NULL; + +- req->se->op.forget(dummy_req, forget->nodeid, forget->nlookup); ++ req->se->op.forget(dummy_req, forgets[i].ino, forgets[i].nlookup); + } + fuse_reply_none(req); + } else { +@@ -762,12 +802,19 @@ static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, + } + } + +-static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { + struct fuse_file_info *fip = NULL; + struct 
fuse_file_info fi; + +- struct fuse_getattr_in *arg = (struct fuse_getattr_in *)inarg; ++ struct fuse_getattr_in *arg; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (arg->getattr_flags & FUSE_GETATTR_FH) { + memset(&fi, 0, sizeof(fi)); +@@ -782,14 +829,21 @@ static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_setattr_in *arg = (struct fuse_setattr_in *)inarg; +- + if (req->se->op.setattr) { ++ struct fuse_setattr_in *arg; + struct fuse_file_info *fi = NULL; + struct fuse_file_info fi_store; + struct stat stbuf; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&stbuf, 0, sizeof(stbuf)); + convert_attr(arg, &stbuf); + if (arg->valid & FATTR_FH) { +@@ -810,9 +864,16 @@ static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_access(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_access(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_access_in *arg = (struct fuse_access_in *)inarg; ++ struct fuse_access_in *arg; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.access) { + req->se->op.access(req, nodeid, arg->mask); +@@ -821,9 +882,10 @@ static void do_access(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- (void)inarg; ++ (void)iter; + + if (req->se->op.readlink) { + req->se->op.readlink(req, nodeid); 
+@@ -832,10 +894,18 @@ static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_mknod_in *arg = (struct fuse_mknod_in *)inarg; +- char *name = PARAM(arg); ++ struct fuse_mknod_in *arg; ++ const char *name; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + req->ctx.umask = arg->umask; + +@@ -846,22 +916,37 @@ static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *)inarg; ++ struct fuse_mkdir_in *arg; ++ const char *name; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + req->ctx.umask = arg->umask; + + if (req->se->op.mkdir) { +- req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); ++ req->se->op.mkdir(req, nodeid, name, arg->mode); + } else { + fuse_reply_err(req, ENOSYS); + } + } + +-static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.unlink) { + req->se->op.unlink(req, nodeid, name); +@@ -870,9 +955,15 @@ static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, const void 
*inarg) ++static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.rmdir) { + req->se->op.rmdir(req, nodeid, name); +@@ -881,10 +972,16 @@ static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; +- char *linkname = ((char *)inarg) + strlen((char *)inarg) + 1; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ const char *linkname = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!name || !linkname) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.symlink) { + req->se->op.symlink(req, linkname, nodeid, name); +@@ -893,11 +990,20 @@ static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_rename(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_rename(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_rename_in *arg = (struct fuse_rename_in *)inarg; +- char *oldname = PARAM(arg); +- char *newname = oldname + strlen(oldname) + 1; ++ struct fuse_rename_in *arg; ++ const char *oldname; ++ const char *newname; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ oldname = fuse_mbuf_iter_advance_str(iter); ++ newname = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !oldname || !newname) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.rename) { + req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, 0); +@@ -906,11 +1012,20 @@ static void do_rename(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, 
const void *inarg) ++static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_rename2_in *arg = (struct fuse_rename2_in *)inarg; +- char *oldname = PARAM(arg); +- char *newname = oldname + strlen(oldname) + 1; ++ struct fuse_rename2_in *arg; ++ const char *oldname; ++ const char *newname; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ oldname = fuse_mbuf_iter_advance_str(iter); ++ newname = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !oldname || !newname) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.rename) { + req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, +@@ -920,24 +1035,38 @@ static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_link(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_link(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_link_in *arg = (struct fuse_link_in *)inarg; ++ struct fuse_link_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.link) { +- req->se->op.link(req, arg->oldnodeid, nodeid, PARAM(arg)); ++ req->se->op.link(req, arg->oldnodeid, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } + } + +-static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_create(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_create_in *arg = (struct fuse_create_in *)inarg; +- + if (req->se->op.create) { ++ struct fuse_create_in *arg; + struct fuse_file_info fi; +- char *name = PARAM(arg); ++ const char *name; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + memset(&fi, 0, sizeof(fi)); + 
fi.flags = arg->flags; +@@ -950,11 +1079,18 @@ static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_open(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_open(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_open_in *arg = (struct fuse_open_in *)inarg; ++ struct fuse_open_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + +@@ -965,13 +1101,15 @@ static void do_open(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_read(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_read_in *arg = (struct fuse_read_in *)inarg; +- + if (req->se->op.read) { ++ struct fuse_read_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->lock_owner; +@@ -982,11 +1120,24 @@ static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_write(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_write_in *arg = (struct fuse_write_in *)inarg; ++ struct fuse_write_in *arg; + struct fuse_file_info fi; +- char *param; ++ const char *param; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ param = fuse_mbuf_iter_advance(iter, arg->size); ++ if (!param) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +@@ -994,7 +1145,6 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + + 
fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; +- param = PARAM(arg); + + if (req->se->op.write) { + req->se->op.write(req, nodeid, param, arg->size, arg->offset, &fi); +@@ -1052,11 +1202,18 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, + se->op.write_buf(req, nodeid, pbufv, arg->offset, &fi); + } + +-static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_flush(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_flush_in *arg = (struct fuse_flush_in *)inarg; ++ struct fuse_flush_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.flush = 1; +@@ -1069,19 +1226,26 @@ static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_release(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_release_in *arg = (struct fuse_release_in *)inarg; ++ struct fuse_release_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; + fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 
1 : 0; + fi.lock_owner = arg->lock_owner; ++ + if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { + fi.flock_release = 1; +- fi.lock_owner = arg->lock_owner; + } + + if (req->se->op.release) { +@@ -1091,11 +1255,19 @@ static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_fsync_in *arg = (struct fuse_fsync_in *)inarg; ++ struct fuse_fsync_in *arg; + struct fuse_file_info fi; +- int datasync = arg->fsync_flags & 1; ++ int datasync; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ datasync = arg->fsync_flags & 1; + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +@@ -1111,11 +1283,18 @@ static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_open_in *arg = (struct fuse_open_in *)inarg; ++ struct fuse_open_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + +@@ -1126,11 +1305,18 @@ static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_read_in *arg = (struct fuse_read_in *)inarg; ++ struct fuse_read_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + 
fi.fh = arg->fh; + +@@ -1141,11 +1327,18 @@ static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_read_in *arg = (struct fuse_read_in *)inarg; ++ struct fuse_read_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +@@ -1156,11 +1349,18 @@ static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_release_in *arg = (struct fuse_release_in *)inarg; ++ struct fuse_release_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; +@@ -1172,11 +1372,19 @@ static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_fsync_in *arg = (struct fuse_fsync_in *)inarg; ++ struct fuse_fsync_in *arg; + struct fuse_file_info fi; +- int datasync = arg->fsync_flags & 1; ++ int datasync; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ datasync = arg->fsync_flags & 1; + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +@@ -1188,10 +1396,11 @@ static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) 
+ } + } + +-static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { + (void)nodeid; +- (void)inarg; ++ (void)iter; + + if (req->se->op.statfs) { + req->se->op.statfs(req, nodeid); +@@ -1204,11 +1413,25 @@ static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *)inarg; +- char *name = PARAM(arg); +- char *value = name + strlen(name) + 1; ++ struct fuse_setxattr_in *arg; ++ const char *name; ++ const char *value; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ value = fuse_mbuf_iter_advance(iter, arg->size); ++ if (!value) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.setxattr) { + req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags); +@@ -1217,20 +1440,36 @@ static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *)inarg; ++ struct fuse_getxattr_in *arg; ++ const char *name; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.getxattr) { +- req->se->op.getxattr(req, nodeid, PARAM(arg), arg->size); ++ req->se->op.getxattr(req, nodeid, name, arg->size); + } else { + fuse_reply_err(req, ENOSYS); + } + } + +-static void 
do_listxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *)inarg; ++ struct fuse_getxattr_in *arg; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.listxattr) { + req->se->op.listxattr(req, nodeid, arg->size); +@@ -1239,9 +1478,15 @@ static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.removexattr) { + req->se->op.removexattr(req, nodeid, name); +@@ -1265,12 +1510,19 @@ static void convert_fuse_file_lock(struct fuse_file_lock *fl, + flock->l_pid = fl->pid; + } + +-static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_lk_in *arg = (struct fuse_lk_in *)inarg; ++ struct fuse_lk_in *arg; + struct fuse_file_info fi; + struct flock flock; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->owner; +@@ -1284,12 +1536,18 @@ static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + + static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg, int sleep) ++ struct fuse_mbuf_iter *iter, int sleep) + { +- struct fuse_lk_in *arg = (struct fuse_lk_in *)inarg; ++ struct fuse_lk_in *arg; + struct fuse_file_info fi; + struct flock 
flock; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->owner; +@@ -1327,14 +1585,16 @@ static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, + } + } + +-static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- do_setlk_common(req, nodeid, inarg, 0); ++ do_setlk_common(req, nodeid, iter, 0); + } + +-static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- do_setlk_common(req, nodeid, inarg, 1); ++ do_setlk_common(req, nodeid, iter, 1); + } + + static int find_interrupted(struct fuse_session *se, struct fuse_req *req) +@@ -1379,12 +1639,20 @@ static int find_interrupted(struct fuse_session *se, struct fuse_req *req) + return 0; + } + +-static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_interrupt_in *arg = (struct fuse_interrupt_in *)inarg; ++ struct fuse_interrupt_in *arg; + struct fuse_session *se = req->se; + + (void)nodeid; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", + (unsigned long long)arg->unique); +@@ -1425,9 +1693,15 @@ static struct fuse_req *check_interrupt(struct fuse_session *se, + } + } + +-static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_bmap_in *arg = (struct fuse_bmap_in *)inarg; ++ struct fuse_bmap_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ ++ if 
(!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.bmap) { + req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); +@@ -1436,18 +1710,34 @@ static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_ioctl_in *arg = (struct fuse_ioctl_in *)inarg; +- unsigned int flags = arg->flags; +- void *in_buf = arg->in_size ? PARAM(arg) : NULL; ++ struct fuse_ioctl_in *arg; ++ unsigned int flags; ++ void *in_buf = NULL; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ flags = arg->flags; + if (flags & FUSE_IOCTL_DIR && !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { + fuse_reply_err(req, ENOTTY); + return; + } + ++ if (arg->in_size) { ++ in_buf = fuse_mbuf_iter_advance(iter, arg->in_size); ++ if (!in_buf) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +@@ -1468,11 +1758,18 @@ void fuse_pollhandle_destroy(struct fuse_pollhandle *ph) + free(ph); + } + +-static void do_poll(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_poll(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_poll_in *arg = (struct fuse_poll_in *)inarg; ++ struct fuse_poll_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.poll_events = arg->events; +@@ -1496,11 +1793,18 @@ static void do_poll(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, ++ 
struct fuse_mbuf_iter *iter) + { +- struct fuse_fallocate_in *arg = (struct fuse_fallocate_in *)inarg; ++ struct fuse_fallocate_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +@@ -1513,12 +1817,17 @@ static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + + static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, +- const void *inarg) ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_copy_file_range_in *arg = +- (struct fuse_copy_file_range_in *)inarg; ++ struct fuse_copy_file_range_in *arg; + struct fuse_file_info fi_in, fi_out; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi_in, 0, sizeof(fi_in)); + fi_in.fh = arg->fh_in; + +@@ -1535,11 +1844,17 @@ static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, + } + } + +-static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_lseek_in *arg = (struct fuse_lseek_in *)inarg; ++ struct fuse_lseek_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +@@ -1550,15 +1865,33 @@ static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_init(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_init_in *arg = (struct fuse_init_in *)inarg; ++ size_t compat_size = offsetof(struct fuse_init_in, max_readahead); ++ struct fuse_init_in *arg; + struct fuse_init_out outarg; + struct fuse_session *se = req->se; + 
size_t bufsize = se->bufsize; + size_t outargsize = sizeof(outarg); + + (void)nodeid; ++ ++ /* First consume the old fields... */ ++ arg = fuse_mbuf_iter_advance(iter, compat_size); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ /* ...and now consume the new fields. */ ++ if (arg->major == 7 && arg->minor >= 6) { ++ if (!fuse_mbuf_iter_advance(iter, sizeof(*arg) - compat_size)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ } ++ + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); + if (arg->major == 7 && arg->minor >= 6) { +@@ -1791,12 +2124,13 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + send_reply_ok(req, &outarg, outargsize); + } + +-static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { + struct fuse_session *se = req->se; + + (void)nodeid; +- (void)inarg; ++ (void)iter; + + se->got_destroy = 1; + if (se->op.destroy) { +@@ -1976,7 +2310,7 @@ int fuse_req_interrupted(fuse_req_t req) + } + + static struct { +- void (*func)(fuse_req_t, fuse_ino_t, const void *); ++ void (*func)(fuse_req_t, fuse_ino_t, struct fuse_mbuf_iter *); + const char *name; + } fuse_ll_ops[] = { + [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, +@@ -2060,7 +2394,6 @@ void fuse_session_process_buf_int(struct fuse_session *se, + const struct fuse_buf *buf = bufv->buf; + struct fuse_mbuf_iter iter = FUSE_MBUF_ITER_INIT(buf); + struct fuse_in_header *in; +- const void *inarg; + struct fuse_req *req; + int err; + +@@ -2138,13 +2471,11 @@ void fuse_session_process_buf_int(struct fuse_session *se, + } + } + +- inarg = (void *)&in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) { + do_write_buf(req, in->nodeid, &iter, bufv); + } else { +- fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); ++ fuse_ll_ops[in->opcode].func(req, in->nodeid, &iter); + } +- + return; + + reply_err: +-- 
+1.8.3.1 + diff --git a/kvm-virtiofsd-cleanup-allocated-resource-in-se.patch b/kvm-virtiofsd-cleanup-allocated-resource-in-se.patch new file mode 100755 index 0000000..b6de0a9 --- /dev/null +++ b/kvm-virtiofsd-cleanup-allocated-resource-in-se.patch @@ -0,0 +1,82 @@ +From 99ff67682ef7c5659bdc9836008541861ae313d5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:56 +0100 +Subject: [PATCH 085/116] virtiofsd: cleanup allocated resource in se +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-82-dgilbert@redhat.com> +Patchwork-id: 93533 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 081/112] virtiofsd: cleanup allocated resource in se +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +This cleans up unfreed resources in se on quiting, including +se->virtio_dev, se->vu_socket_path, se->vu_socketfd. + +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 61cfc44982e566c33b9d5df17858e4d5ae373873) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 7 +++++++ + tools/virtiofsd/fuse_virtio.c | 7 +++++++ + tools/virtiofsd/fuse_virtio.h | 2 +- + 3 files changed, 15 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 65f91da..440508a 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2532,6 +2532,13 @@ void fuse_session_destroy(struct fuse_session *se) + if (se->fd != -1) { + close(se->fd); + } ++ ++ if (se->vu_socket_path) { ++ virtio_session_close(se); ++ free(se->vu_socket_path); ++ se->vu_socket_path = NULL; ++ } ++ + free(se); + } + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 7a8774a..e7bd772 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -833,3 +833,10 @@ int virtio_session_mount(struct fuse_session *se) + + return 0; + } ++ ++void virtio_session_close(struct fuse_session *se) ++{ ++ close(se->vu_socketfd); ++ free(se->virtio_dev); ++ se->virtio_dev = NULL; ++} +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +index cc676b9..1116840 100644 +--- a/tools/virtiofsd/fuse_virtio.h ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -19,7 +19,7 @@ + struct fuse_session; + + int virtio_session_mount(struct fuse_session *se); +- ++void virtio_session_close(struct fuse_session *se); + int virtio_loop(struct fuse_session *se); + + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-convert-more-fprintf-and-perror-to-use-fus.patch b/kvm-virtiofsd-convert-more-fprintf-and-perror-to-use-fus.patch new file mode 100755 index 0000000..d01b000 --- /dev/null +++ b/kvm-virtiofsd-convert-more-fprintf-and-perror-to-use-fus.patch @@ -0,0 +1,99 @@ +From e00543b0384fba61a9c7274c73e11a25e7ab2946 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:13 +0100 +Subject: [PATCH 102/116] virtiofsd: convert more fprintf and perror to use + fuse log infra +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-99-dgilbert@redhat.com> +Patchwork-id: 93552 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 098/112] virtiofsd: convert more fprintf and perror to use fuse log infra +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Eryu Guan + +Signed-off-by: Eryu Guan +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Misono Tomohiro +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit fc1aed0bf96259d0b46b1cfea7497b7762c4ee3d) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_signals.c | 7 +++++-- + tools/virtiofsd/helper.c | 9 ++++++--- + 2 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c +index dc7c8ac..f18625b 100644 +--- a/tools/virtiofsd/fuse_signals.c ++++ b/tools/virtiofsd/fuse_signals.c +@@ -12,6 +12,7 @@ + #include "fuse_i.h" + #include "fuse_lowlevel.h" + ++#include + #include + #include + #include +@@ -47,13 +48,15 @@ static int set_one_signal_handler(int sig, void (*handler)(int), int remove) + sa.sa_flags = 0; + + if (sigaction(sig, NULL, &old_sa) == -1) { +- perror("fuse: cannot get old signal handler"); ++ fuse_log(FUSE_LOG_ERR, "fuse: cannot get old signal handler: %s\n", ++ strerror(errno)); + return -1; + } + + if (old_sa.sa_handler == (remove ? 
handler : SIG_DFL) && + sigaction(sig, &sa, NULL) == -1) { +- perror("fuse: cannot set signal handler"); ++ fuse_log(FUSE_LOG_ERR, "fuse: cannot set signal handler: %s\n", ++ strerror(errno)); + return -1; + } + return 0; +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 33749bf..f98d8f2 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -208,7 +208,8 @@ int fuse_daemonize(int foreground) + char completed; + + if (pipe(waiter)) { +- perror("fuse_daemonize: pipe"); ++ fuse_log(FUSE_LOG_ERR, "fuse_daemonize: pipe: %s\n", ++ strerror(errno)); + return -1; + } + +@@ -218,7 +219,8 @@ int fuse_daemonize(int foreground) + */ + switch (fork()) { + case -1: +- perror("fuse_daemonize: fork"); ++ fuse_log(FUSE_LOG_ERR, "fuse_daemonize: fork: %s\n", ++ strerror(errno)); + return -1; + case 0: + break; +@@ -228,7 +230,8 @@ int fuse_daemonize(int foreground) + } + + if (setsid() == -1) { +- perror("fuse_daemonize: setsid"); ++ fuse_log(FUSE_LOG_ERR, "fuse_daemonize: setsid: %s\n", ++ strerror(errno)); + return -1; + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-do-not-always-set-FUSE_FLOCK_LOCKS.patch b/kvm-virtiofsd-do-not-always-set-FUSE_FLOCK_LOCKS.patch new file mode 100755 index 0000000..8c1022a --- /dev/null +++ b/kvm-virtiofsd-do-not-always-set-FUSE_FLOCK_LOCKS.patch @@ -0,0 +1,57 @@ +From 8e6473e906dfc7d2a62abaf1ec80ff461e4d201d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:12 +0100 +Subject: [PATCH 101/116] virtiofsd: do not always set FUSE_FLOCK_LOCKS +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-98-dgilbert@redhat.com> +Patchwork-id: 93551 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 097/112] virtiofsd: do not always set FUSE_FLOCK_LOCKS +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Peng Tao + +Right now we always enable it regardless of given commandlines. +Fix it by setting the flag relying on the lo->flock bit. + +Signed-off-by: Peng Tao +Reviewed-by: Misono Tomohiro +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e468d4af5f5192ab33283464a9f6933044ce47f7) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index ab16135..ccbbec1 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -546,9 +546,14 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); + conn->want |= FUSE_CAP_WRITEBACK_CACHE; + } +- if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); +- conn->want |= FUSE_CAP_FLOCK_LOCKS; ++ if (conn->capable & FUSE_CAP_FLOCK_LOCKS) { ++ if (lo->flock) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); ++ conn->want |= FUSE_CAP_FLOCK_LOCKS; ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n"); ++ conn->want &= ~FUSE_CAP_FLOCK_LOCKS; ++ } + } + + if (conn->capable & FUSE_CAP_POSIX_LOCKS) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-do_read-missing-NULL-check.patch b/kvm-virtiofsd-do_read-missing-NULL-check.patch new file mode 100755 index 0000000..4f8e5ef --- /dev/null +++ b/kvm-virtiofsd-do_read-missing-NULL-check.patch @@ -0,0 +1,49 @@ +From 901c005299b0316bbca7bc190de56f6c7a2a9880 Mon 
Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:11 +0000 +Subject: [PATCH 15/18] virtiofsd: do_read missing NULL check +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-5-dgilbert@redhat.com> +Patchwork-id: 94127 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/7] virtiofsd: do_read missing NULL check +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: "Dr. David Alan Gilbert" + +Missing a NULL check if the argument fetch fails. + +Fixes: Coverity CID 1413119 +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 99ce9a7e60fd12b213b985343ff8fcc172de59fd) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse_lowlevel.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 01c418a..704c036 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1116,6 +1116,10 @@ static void do_read(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-drop-all-capabilities-in-the-wait-parent-p.patch b/kvm-virtiofsd-drop-all-capabilities-in-the-wait-parent-p.patch new file mode 100755 index 0000000..569096d --- /dev/null +++ b/kvm-virtiofsd-drop-all-capabilities-in-the-wait-parent-p.patch @@ -0,0 +1,67 @@ +From 78152453940967f9ece9fe3ffc5017c669d6ec28 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Tue, 5 May 2020 16:36:00 +0100 +Subject: [PATCH 9/9] virtiofsd: drop all capabilities in the wait parent + process +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-8-dgilbert@redhat.com> +Patchwork-id: 96274 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 7/7] virtiofsd: drop all capabilities in the wait parent process +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michael S. Tsirkin + +From: Stefan Hajnoczi + +All this process does is wait for its child. No capabilities are +needed. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 66502bbca37ca7a3bfa57e82cfc03b89a7a11eae) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 6358874..f41a6b0 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2535,6 +2535,17 @@ static void print_capabilities(void) + } + + /* ++ * Drop all Linux capabilities because the wait parent process only needs to ++ * sit in waitpid(2) and terminate. ++ */ ++static void setup_wait_parent_capabilities(void) ++{ ++ capng_setpid(syscall(SYS_gettid)); ++ capng_clear(CAPNG_SELECT_BOTH); ++ capng_apply(CAPNG_SELECT_BOTH); ++} ++ ++/* + * Move to a new mount, net, and pid namespaces to isolate this process. 
+ */ + static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) +@@ -2567,6 +2578,8 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + pid_t waited; + int wstatus; + ++ setup_wait_parent_capabilities(); ++ + /* The parent waits for the child */ + do { + waited = waitpid(child, &wstatus, 0); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-enable-PARALLEL_DIROPS-during-INIT.patch b/kvm-virtiofsd-enable-PARALLEL_DIROPS-during-INIT.patch new file mode 100755 index 0000000..3279a5e --- /dev/null +++ b/kvm-virtiofsd-enable-PARALLEL_DIROPS-during-INIT.patch @@ -0,0 +1,47 @@ +From bc127914b29f2e4163bc7ca786e04ed955d96016 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:00 +0100 +Subject: [PATCH 089/116] virtiofsd: enable PARALLEL_DIROPS during INIT +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-86-dgilbert@redhat.com> +Patchwork-id: 93539 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 085/112] virtiofsd: enable PARALLEL_DIROPS during INIT +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +lookup is a RO operations, PARALLEL_DIROPS can be enabled. + +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit b7ed733a3841c4d489d3bd6ca7ed23c84db119c2) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index aac282f..70568d2 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2062,6 +2062,9 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + if (se->conn.want & FUSE_CAP_ASYNC_READ) { + outarg.flags |= FUSE_ASYNC_READ; + } ++ if (se->conn.want & FUSE_CAP_PARALLEL_DIROPS) { ++ outarg.flags |= FUSE_PARALLEL_DIROPS; ++ } + if (se->conn.want & FUSE_CAP_POSIX_LOCKS) { + outarg.flags |= FUSE_POSIX_LOCKS; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch b/kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch new file mode 100755 index 0000000..b0f678f --- /dev/null +++ b/kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch @@ -0,0 +1,167 @@ +From c02ebc7e43f55b9423a065a7c53ba72bdb821c98 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 9 Feb 2021 23:14:54 -0500 +Subject: [PATCH 1/3] virtiofsd: extract lo_do_open() from lo_open() + +RH-Author: Jon Maloy +Message-id: <20210209231456.1555472-2-jmaloy@redhat.com> +Patchwork-id: 101024 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/3] virtiofsd: extract lo_do_open() from lo_open() +Bugzilla: 1919111 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Greg Kurz +RH-Acked-by: Dr. David Alan Gilbert + +From: Stefan Hajnoczi + +Both lo_open() and lo_create() have similar code to open a file. Extract +a common lo_do_open() function from lo_open() that will be used by +lo_create() in a later commit. + +Since lo_do_open() does not otherwise need fuse_req_t req, convert +lo_add_fd_mapping() to use struct lo_data *lo instead. + +Signed-off-by: Stefan Hajnoczi +Message-Id: <20210204150208.367837-2-stefanha@redhat.com> +Reviewed-by: Greg Kurz +Signed-off-by: Dr. 
David Alan Gilbert + +(cherry-picked from commit 8afaaee976965b7fb90ec225a51d60f35c5f173c) + +Conflict: update_open_flags() takes fewer arguments in this version + than in upstream. Instead of applying commit e12a0edafeb + ("virtiofsd: Add -o allow_direct_io|no_allow_direct_io + options") we keep the old signature, since this seems to + be an unrelated change. + +Signed-off-by: Jon Maloy +Signed-off-by: Jon Maloy +--- + tools/virtiofsd/passthrough_ll.c | 73 ++++++++++++++++++++------------ + 1 file changed, 46 insertions(+), 27 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index f41a6b07c8..518ba11c47 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -439,17 +439,17 @@ static void lo_map_remove(struct lo_map *map, size_t key) + } + + /* Assumes lo->mutex is held */ +-static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd) ++static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd) + { + struct lo_map_elem *elem; + +- elem = lo_map_alloc_elem(&lo_data(req)->fd_map); ++ elem = lo_map_alloc_elem(&lo->fd_map); + if (!elem) { + return -1; + } + + elem->fd = fd; +- return elem - lo_data(req)->fd_map.elems; ++ return elem - lo->fd_map.elems; + } + + /* Assumes lo->mutex is held */ +@@ -1712,6 +1712,38 @@ static void update_open_flags(int writeback, struct fuse_file_info *fi) + fi->flags &= ~O_DIRECT; + } + ++static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, ++ struct fuse_file_info *fi) ++{ ++ char buf[64]; ++ ssize_t fh; ++ int fd; ++ ++ update_open_flags(lo->writeback, fi); ++ ++ sprintf(buf, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); ++ if (fd == -1) { ++ return errno; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ fh = lo_add_fd_mapping(lo, fd); ++ pthread_mutex_unlock(&lo->mutex); ++ if (fh == -1) { ++ close(fd); ++ return ENOMEM; ++ } ++ ++ fi->fh = fh; ++ if (lo->cache == CACHE_NONE) { ++ fi->direct_io = 1; ++ } else 
if (lo->cache == CACHE_ALWAYS) { ++ fi->keep_cache = 1; ++ } ++ return 0; ++} ++ + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) + { +@@ -1752,7 +1784,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + ssize_t fh; + + pthread_mutex_lock(&lo->mutex); +- fh = lo_add_fd_mapping(req, fd); ++ fh = lo_add_fd_mapping(lo, fd); + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + close(fd); +@@ -1943,38 +1975,25 @@ static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + + static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { +- int fd; +- ssize_t fh; +- char buf[64]; + struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); ++ int err; + + fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, + fi->flags); + +- update_open_flags(lo->writeback, fi); +- +- sprintf(buf, "%i", lo_fd(req, ino)); +- fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); +- if (fd == -1) { +- return (void)fuse_reply_err(req, errno); +- } +- +- pthread_mutex_lock(&lo->mutex); +- fh = lo_add_fd_mapping(req, fd); +- pthread_mutex_unlock(&lo->mutex); +- if (fh == -1) { +- close(fd); +- fuse_reply_err(req, ENOMEM); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); + return; + } + +- fi->fh = fh; +- if (lo->cache == CACHE_NONE) { +- fi->direct_io = 1; +- } else if (lo->cache == CACHE_ALWAYS) { +- fi->keep_cache = 1; ++ err = lo_do_open(lo, inode, fi); ++ lo_inode_put(lo, &inode); ++ if (err) { ++ fuse_reply_err(req, err); ++ } else { ++ fuse_reply_open(req, fi); + } +- fuse_reply_open(req, fi); + } + + static void lo_release(fuse_req_t req, fuse_ino_t ino, +-- +2.18.2 + diff --git a/kvm-virtiofsd-extract-root-inode-init-into-setup_root.patch b/kvm-virtiofsd-extract-root-inode-init-into-setup_root.patch new file mode 100755 index 0000000..96f91a1 --- /dev/null +++ 
b/kvm-virtiofsd-extract-root-inode-init-into-setup_root.patch @@ -0,0 +1,111 @@ +From 983b383bc4a92a9f7ecff0332cadefed2f58f502 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:50 +0100 +Subject: [PATCH 079/116] virtiofsd: extract root inode init into setup_root() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-76-dgilbert@redhat.com> +Patchwork-id: 93527 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 075/112] virtiofsd: extract root inode init into setup_root() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Inititialize the root inode in a single place. + +Signed-off-by: Miklos Szeredi +Signed-off-by: Stefan Hajnoczi +dgilbert: +with fix suggested by Misono Tomohiro +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 3ca8a2b1c83eb185c232a4e87abbb65495263756) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 35 +++++++++++++++++++++++++---------- + 1 file changed, 25 insertions(+), 10 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 33bfb4d..9e7191e 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2351,6 +2351,30 @@ static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) + } + } + ++static void setup_root(struct lo_data *lo, struct lo_inode *root) ++{ ++ int fd, res; ++ struct stat stat; ++ ++ fd = open("/", O_PATH); ++ if (fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source); ++ exit(1); ++ } ++ ++ res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source); ++ exit(1); ++ } ++ ++ root->is_symlink = false; ++ root->fd = fd; 
++ root->ino = stat.st_ino; ++ root->dev = stat.st_dev; ++ root->refcount = 2; ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2426,8 +2450,6 @@ int main(int argc, char *argv[]) + if (lo.debug) { + current_log_level = FUSE_LOG_DEBUG; + } +- lo.root.refcount = 2; +- + if (lo.source) { + struct stat stat; + int res; +@@ -2446,7 +2468,6 @@ int main(int argc, char *argv[]) + } else { + lo.source = "/"; + } +- lo.root.is_symlink = false; + if (!lo.timeout_set) { + switch (lo.cache) { + case CACHE_NEVER: +@@ -2466,13 +2487,6 @@ int main(int argc, char *argv[]) + exit(1); + } + +- lo.root.fd = open(lo.source, O_PATH); +- +- if (lo.root.fd == -1) { +- fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): %m\n", lo.source); +- exit(1); +- } +- + se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); + if (se == NULL) { + goto err_out1; +@@ -2495,6 +2509,7 @@ int main(int argc, char *argv[]) + + setup_sandbox(&lo, se, opts.syslog); + ++ setup_root(&lo, &lo.root); + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fail-when-parent-inode-isn-t-known-in-lo_d.patch b/kvm-virtiofsd-fail-when-parent-inode-isn-t-known-in-lo_d.patch new file mode 100755 index 0000000..4860bec --- /dev/null +++ b/kvm-virtiofsd-fail-when-parent-inode-isn-t-known-in-lo_d.patch @@ -0,0 +1,85 @@ +From b3cd18ab58e331d3610cf00f857d6a945f11a030 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:49 +0100 +Subject: [PATCH 078/116] virtiofsd: fail when parent inode isn't known in + lo_do_lookup() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-75-dgilbert@redhat.com> +Patchwork-id: 93529 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 074/112] virtiofsd: fail when parent inode isn't known in lo_do_lookup() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +The Linux file handle APIs (struct export_operations) can access inodes +that are not attached to parents because path name traversal is not +performed. Refuse if there is no parent in lo_do_lookup(). + +Also clean up lo_do_lookup() while we're here. + +Signed-off-by: Miklos Szeredi +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9de4fab5995d115f8ebfb41d8d94a866d80a1708) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index de12e75..33bfb4d 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -777,6 +777,15 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct lo_data *lo = lo_data(req); + struct lo_inode *inode, *dir = lo_inode(req, parent); + ++ /* ++ * name_to_handle_at() and open_by_handle_at() can reach here with fuse ++ * mount point in guest, but we don't have its inode info in the ++ * ino_map. 
++ */ ++ if (!dir) { ++ return ENOENT; ++ } ++ + memset(e, 0, sizeof(*e)); + e->attr_timeout = lo->timeout; + e->entry_timeout = lo->timeout; +@@ -786,7 +795,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + name = "."; + } + +- newfd = openat(lo_fd(req, parent), name, O_PATH | O_NOFOLLOW); ++ newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW); + if (newfd == -1) { + goto out_err; + } +@@ -796,7 +805,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out_err; + } + +- inode = lo_find(lo_data(req), &e->attr); ++ inode = lo_find(lo, &e->attr); + if (inode) { + close(newfd); + newfd = -1; +@@ -812,6 +821,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + inode->is_symlink = S_ISLNK(e->attr.st_mode); + inode->refcount = 1; + inode->fd = newfd; ++ newfd = -1; + inode->ino = e->attr.st_ino; + inode->dev = e->attr.st_dev; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-error-handling-in-main.patch b/kvm-virtiofsd-fix-error-handling-in-main.patch new file mode 100755 index 0000000..a831992 --- /dev/null +++ b/kvm-virtiofsd-fix-error-handling-in-main.patch @@ -0,0 +1,63 @@ +From 0ea1c7375d6509367399c706eb9d1e8cf79a5830 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:55 +0100 +Subject: [PATCH 084/116] virtiofsd: fix error handling in main() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-81-dgilbert@redhat.com> +Patchwork-id: 93534 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 080/112] virtiofsd: fix error handling in main() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +Neither fuse_parse_cmdline() nor fuse_opt_parse() goes to the right place +to do cleanup. + +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. 
Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit c6de804670f2255ce776263124c37f3370dc5ac1) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9ed77a1..af050c6 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2443,13 +2443,14 @@ int main(int argc, char *argv[]) + lo_map_init(&lo.fd_map); + + if (fuse_parse_cmdline(&args, &opts) != 0) { +- return 1; ++ goto err_out1; + } + fuse_set_log_func(log_func); + use_syslog = opts.syslog; + if (use_syslog) { + openlog("virtiofsd", LOG_PID, LOG_DAEMON); + } ++ + if (opts.show_help) { + printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); +@@ -2468,7 +2469,7 @@ int main(int argc, char *argv[]) + } + + if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { +- return 1; ++ goto err_out1; + } + + /* +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-incorrect-error-handling-in-lo_do_look.patch b/kvm-virtiofsd-fix-incorrect-error-handling-in-lo_do_look.patch new file mode 100755 index 0000000..420a8a6 --- /dev/null +++ b/kvm-virtiofsd-fix-incorrect-error-handling-in-lo_do_look.patch @@ -0,0 +1,44 @@ +From 9c291ca8624318613ede6e4174d08cf45aae8384 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:01 +0100 +Subject: [PATCH 090/116] virtiofsd: fix incorrect error handling in + lo_do_lookup +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-87-dgilbert@redhat.com> +Patchwork-id: 93543 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 086/112] virtiofsd: fix incorrect error handling in lo_do_lookup +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Eric Ren + +Signed-off-by: Eric Ren +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit fc3f0041b43b6c64aa97b3558a6abe1a10028354) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e8dc5c7..05b5f89 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -814,7 +814,6 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + close(newfd); + newfd = -1; + } else { +- saverr = ENOMEM; + inode = calloc(1, sizeof(struct lo_inode)); + if (!inode) { + goto out_err; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-libfuse-information-leaks.patch b/kvm-virtiofsd-fix-libfuse-information-leaks.patch new file mode 100755 index 0000000..90debb0 --- /dev/null +++ b/kvm-virtiofsd-fix-libfuse-information-leaks.patch @@ -0,0 +1,322 @@ +From e0d64e481e5a9fab5ff90d2a8f84afcd3311d13b Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:35 +0100 +Subject: [PATCH 064/116] virtiofsd: fix libfuse information leaks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-61-dgilbert@redhat.com> +Patchwork-id: 93515 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 060/112] virtiofsd: fix libfuse information leaks +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Some FUSE message replies contain padding fields that are not +initialized by libfuse. This is fine in traditional FUSE applications +because the kernel is trusted. virtiofsd does not trust the guest and +must not expose uninitialized memory. + +Use C struct initializers to automatically zero out memory. Not all of +these code changes are strictly necessary but they will prevent future +information leaks if the structs are extended. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 3db2876a0153ac7103c077c53090e020faffb3ea) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 150 ++++++++++++++++++++-------------------- + 1 file changed, 76 insertions(+), 74 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 2d6dc5a..6ceb33d 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -44,21 +44,23 @@ static __attribute__((constructor)) void fuse_ll_init_pagesize(void) + + static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr) + { +- attr->ino = stbuf->st_ino; +- attr->mode = stbuf->st_mode; +- attr->nlink = stbuf->st_nlink; +- attr->uid = stbuf->st_uid; +- attr->gid = stbuf->st_gid; +- attr->rdev = stbuf->st_rdev; +- attr->size = stbuf->st_size; +- attr->blksize = stbuf->st_blksize; +- attr->blocks = stbuf->st_blocks; +- attr->atime = stbuf->st_atime; +- attr->mtime = stbuf->st_mtime; +- attr->ctime = stbuf->st_ctime; +- attr->atimensec = ST_ATIM_NSEC(stbuf); +- attr->mtimensec = 
ST_MTIM_NSEC(stbuf); +- attr->ctimensec = ST_CTIM_NSEC(stbuf); ++ *attr = (struct fuse_attr){ ++ .ino = stbuf->st_ino, ++ .mode = stbuf->st_mode, ++ .nlink = stbuf->st_nlink, ++ .uid = stbuf->st_uid, ++ .gid = stbuf->st_gid, ++ .rdev = stbuf->st_rdev, ++ .size = stbuf->st_size, ++ .blksize = stbuf->st_blksize, ++ .blocks = stbuf->st_blocks, ++ .atime = stbuf->st_atime, ++ .mtime = stbuf->st_mtime, ++ .ctime = stbuf->st_ctime, ++ .atimensec = ST_ATIM_NSEC(stbuf), ++ .mtimensec = ST_MTIM_NSEC(stbuf), ++ .ctimensec = ST_CTIM_NSEC(stbuf), ++ }; + } + + static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf) +@@ -183,16 +185,16 @@ static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + int count) + { +- struct fuse_out_header out; ++ struct fuse_out_header out = { ++ .unique = req->unique, ++ .error = error, ++ }; + + if (error <= -1000 || error > 0) { + fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); + error = -ERANGE; + } + +- out.unique = req->unique; +- out.error = error; +- + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + +@@ -277,14 +279,16 @@ size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, + static void convert_statfs(const struct statvfs *stbuf, + struct fuse_kstatfs *kstatfs) + { +- kstatfs->bsize = stbuf->f_bsize; +- kstatfs->frsize = stbuf->f_frsize; +- kstatfs->blocks = stbuf->f_blocks; +- kstatfs->bfree = stbuf->f_bfree; +- kstatfs->bavail = stbuf->f_bavail; +- kstatfs->files = stbuf->f_files; +- kstatfs->ffree = stbuf->f_ffree; +- kstatfs->namelen = stbuf->f_namemax; ++ *kstatfs = (struct fuse_kstatfs){ ++ .bsize = stbuf->f_bsize, ++ .frsize = stbuf->f_frsize, ++ .blocks = stbuf->f_blocks, ++ .bfree = stbuf->f_bfree, ++ .bavail = stbuf->f_bavail, ++ .files = stbuf->f_files, ++ .ffree = stbuf->f_ffree, ++ .namelen = stbuf->f_namemax, ++ }; + } + + static int 
send_reply_ok(fuse_req_t req, const void *arg, size_t argsize) +@@ -328,12 +332,14 @@ static unsigned int calc_timeout_nsec(double t) + static void fill_entry(struct fuse_entry_out *arg, + const struct fuse_entry_param *e) + { +- arg->nodeid = e->ino; +- arg->generation = e->generation; +- arg->entry_valid = calc_timeout_sec(e->entry_timeout); +- arg->entry_valid_nsec = calc_timeout_nsec(e->entry_timeout); +- arg->attr_valid = calc_timeout_sec(e->attr_timeout); +- arg->attr_valid_nsec = calc_timeout_nsec(e->attr_timeout); ++ *arg = (struct fuse_entry_out){ ++ .nodeid = e->ino, ++ .generation = e->generation, ++ .entry_valid = calc_timeout_sec(e->entry_timeout), ++ .entry_valid_nsec = calc_timeout_nsec(e->entry_timeout), ++ .attr_valid = calc_timeout_sec(e->attr_timeout), ++ .attr_valid_nsec = calc_timeout_nsec(e->attr_timeout), ++ }; + convert_stat(&e->attr, &arg->attr); + } + +@@ -362,10 +368,12 @@ size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, + fill_entry(&dp->entry_out, e); + + struct fuse_dirent *dirent = &dp->dirent; +- dirent->ino = e->attr.st_ino; +- dirent->off = off; +- dirent->namelen = namelen; +- dirent->type = (e->attr.st_mode & S_IFMT) >> 12; ++ *dirent = (struct fuse_dirent){ ++ .ino = e->attr.st_ino, ++ .off = off, ++ .namelen = namelen, ++ .type = (e->attr.st_mode & S_IFMT) >> 12, ++ }; + memcpy(dirent->name, name, namelen); + memset(dirent->name + namelen, 0, entlen_padded - entlen); + +@@ -496,15 +504,14 @@ static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv) + { + struct iovec iov[2]; +- struct fuse_out_header out; ++ struct fuse_out_header out = { ++ .unique = req->unique, ++ }; + int res; + + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + +- out.unique = req->unique; +- out.error = 0; +- + res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv); + if (res <= 0) { + fuse_free_req(req); +@@ -2145,14 
+2152,14 @@ static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, + static int send_notify_iov(struct fuse_session *se, int notify_code, + struct iovec *iov, int count) + { +- struct fuse_out_header out; ++ struct fuse_out_header out = { ++ .error = notify_code, ++ }; + + if (!se->got_init) { + return -ENOTCONN; + } + +- out.unique = 0; +- out.error = notify_code; + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + +@@ -2162,11 +2169,11 @@ static int send_notify_iov(struct fuse_session *se, int notify_code, + int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph) + { + if (ph != NULL) { +- struct fuse_notify_poll_wakeup_out outarg; ++ struct fuse_notify_poll_wakeup_out outarg = { ++ .kh = ph->kh, ++ }; + struct iovec iov[2]; + +- outarg.kh = ph->kh; +- + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + +@@ -2179,17 +2186,17 @@ int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph) + int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + off_t off, off_t len) + { +- struct fuse_notify_inval_inode_out outarg; ++ struct fuse_notify_inval_inode_out outarg = { ++ .ino = ino, ++ .off = off, ++ .len = len, ++ }; + struct iovec iov[2]; + + if (!se) { + return -EINVAL; + } + +- outarg.ino = ino; +- outarg.off = off; +- outarg.len = len; +- + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + +@@ -2199,17 +2206,16 @@ int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, + const char *name, size_t namelen) + { +- struct fuse_notify_inval_entry_out outarg; ++ struct fuse_notify_inval_entry_out outarg = { ++ .parent = parent, ++ .namelen = namelen, ++ }; + struct iovec iov[3]; + + if (!se) { + return -EINVAL; + } + +- outarg.parent = parent; +- outarg.namelen = namelen; +- outarg.padding = 0; +- + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + iov[2].iov_base = (void 
*)name; +@@ -2222,18 +2228,17 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + fuse_ino_t child, const char *name, + size_t namelen) + { +- struct fuse_notify_delete_out outarg; ++ struct fuse_notify_delete_out outarg = { ++ .parent = parent, ++ .child = child, ++ .namelen = namelen, ++ }; + struct iovec iov[3]; + + if (!se) { + return -EINVAL; + } + +- outarg.parent = parent; +- outarg.child = child; +- outarg.namelen = namelen; +- outarg.padding = 0; +- + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + iov[2].iov_base = (void *)name; +@@ -2245,24 +2250,21 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + off_t offset, struct fuse_bufvec *bufv) + { +- struct fuse_out_header out; +- struct fuse_notify_store_out outarg; ++ struct fuse_out_header out = { ++ .error = FUSE_NOTIFY_STORE, ++ }; ++ struct fuse_notify_store_out outarg = { ++ .nodeid = ino, ++ .offset = offset, ++ .size = fuse_buf_size(bufv), ++ }; + struct iovec iov[3]; +- size_t size = fuse_buf_size(bufv); + int res; + + if (!se) { + return -EINVAL; + } + +- out.unique = 0; +- out.error = FUSE_NOTIFY_STORE; +- +- outarg.nodeid = ino; +- outarg.offset = offset; +- outarg.size = size; +- outarg.padding = 0; +- + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(out); + iov[1].iov_base = &outarg; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-lo_destroy-resource-leaks.patch b/kvm-virtiofsd-fix-lo_destroy-resource-leaks.patch new file mode 100755 index 0000000..6243037 --- /dev/null +++ b/kvm-virtiofsd-fix-lo_destroy-resource-leaks.patch @@ -0,0 +1,94 @@ +From 9a44d78f5019280b006bb5b3de7164336289d639 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:21 +0100 +Subject: [PATCH 110/116] virtiofsd: fix lo_destroy() resource leaks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-107-dgilbert@redhat.com> +Patchwork-id: 93560 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 106/112] virtiofsd: fix lo_destroy() resource leaks +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Now that lo_destroy() is serialized we can call unref_inode() so that +all inode resources are freed. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 28f7a3b026f231bfe8de5fed6a18a8d27b1dfcee) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 41 ++++++++++++++++++++-------------------- + 1 file changed, 20 insertions(+), 21 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 79b8b71..eb001b9 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1371,26 +1371,6 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + } + } + +-static int unref_all_inodes_cb(gpointer key, gpointer value, gpointer user_data) +-{ +- struct lo_inode *inode = value; +- struct lo_data *lo = user_data; +- +- inode->nlookup = 0; +- lo_map_remove(&lo->ino_map, inode->fuse_ino); +- close(inode->fd); +- lo_inode_put(lo, &inode); /* Drop our refcount from lo_do_lookup() */ +- +- return TRUE; +-} +- +-static void unref_all_inodes(struct lo_data *lo) +-{ +- pthread_mutex_lock(&lo->mutex); +- g_hash_table_foreach_remove(lo->inodes, unref_all_inodes_cb, lo); +- pthread_mutex_unlock(&lo->mutex); +-} +- + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, 
uint64_t nlookup) + { + struct lo_data *lo = lo_data(req); +@@ -2477,7 +2457,26 @@ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, + static void lo_destroy(void *userdata) + { + struct lo_data *lo = (struct lo_data *)userdata; +- unref_all_inodes(lo); ++ ++ /* ++ * Normally lo->mutex must be taken when traversing lo->inodes but ++ * lo_destroy() is a serialized request so no races are possible here. ++ * ++ * In addition, we cannot acquire lo->mutex since unref_inode() takes it ++ * too and this would result in a recursive lock. ++ */ ++ while (true) { ++ GHashTableIter iter; ++ gpointer key, value; ++ ++ g_hash_table_iter_init(&iter, lo->inodes); ++ if (!g_hash_table_iter_next(&iter, &key, &value)) { ++ break; ++ } ++ ++ struct lo_inode *inode = value; ++ unref_inode_lolocked(lo, inode, inode->nlookup); ++ } + } + + static struct fuse_lowlevel_ops lo_oper = { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-memory-leak-on-lo.source.patch b/kvm-virtiofsd-fix-memory-leak-on-lo.source.patch new file mode 100755 index 0000000..4d7d6dc --- /dev/null +++ b/kvm-virtiofsd-fix-memory-leak-on-lo.source.patch @@ -0,0 +1,66 @@ +From 9e0f5b64f30c2f841f297e25c2f3a6d82c8a16b8 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:57 +0100 +Subject: [PATCH 086/116] virtiofsd: fix memory leak on lo.source +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-83-dgilbert@redhat.com> +Patchwork-id: 93536 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 082/112] virtiofsd: fix memory leak on lo.source +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +valgrind reported that lo.source is leaked on quiting, but it was defined +as (const char*) as it may point to a const string "/". 
+ +Signed-off-by: Liu Bo +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit eb68a33b5fc5dde87bd9b99b94e7c33a5d8ea82e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index af050c6..056ebe8 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -115,7 +115,7 @@ struct lo_data { + int writeback; + int flock; + int xattr; +- const char *source; ++ char *source; + double timeout; + int cache; + int timeout_set; +@@ -2497,9 +2497,8 @@ int main(int argc, char *argv[]) + fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); + exit(1); + } +- + } else { +- lo.source = "/"; ++ lo.source = strdup("/"); + } + if (!lo.timeout_set) { + switch (lo.cache) { +@@ -2570,5 +2569,7 @@ err_out1: + close(lo.root.fd); + } + ++ free(lo.source); ++ + return ret ? 1 : 0; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fv_create_listen_socket-error-path-socket-.patch b/kvm-virtiofsd-fv_create_listen_socket-error-path-socket-.patch new file mode 100755 index 0000000..b17d93c --- /dev/null +++ b/kvm-virtiofsd-fv_create_listen_socket-error-path-socket-.patch @@ -0,0 +1,56 @@ +From 3b6461ee08654b2cbb6d4e0cc15c02f89a6610d5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:09 +0000 +Subject: [PATCH 13/18] virtiofsd: fv_create_listen_socket error path socket + leak +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-3-dgilbert@redhat.com> +Patchwork-id: 94124 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/7] virtiofsd: fv_create_listen_socket error path socket leak +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: "Dr. 
David Alan Gilbert" + +If we fail when bringing up the socket we can leak the listen_fd; +in practice the daemon will exit so it's not really a problem. + +Fixes: Coverity CID 1413121 +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 6fa249027f97e3080f3d9c0fab3f94f8f80828fe) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse_virtio.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 80a6e92..dd1c605 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -916,6 +916,7 @@ static int fv_create_listen_socket(struct fuse_session *se) + old_umask = umask(0077); + if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n"); ++ close(listen_sock); + umask(old_umask); + return -1; + } +@@ -923,6 +924,7 @@ static int fv_create_listen_socket(struct fuse_session *se) + + if (listen(listen_sock, 1) == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n"); ++ close(listen_sock); + return -1; + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-get-set-features-callbacks.patch b/kvm-virtiofsd-get-set-features-callbacks.patch new file mode 100755 index 0000000..fcb5ca2 --- /dev/null +++ b/kvm-virtiofsd-get-set-features-callbacks.patch @@ -0,0 +1,66 @@ +From 59bfe3ad924d00dc9c7a4363fcd3db36ea247988 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:59 +0100 +Subject: [PATCH 028/116] virtiofsd: get/set features callbacks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-25-dgilbert@redhat.com> +Patchwork-id: 93478 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 024/112] virtiofsd: get/set features callbacks +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Add the get/set features callbacks. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit f2cef5fb9ae20136ca18d16328787b69b3abfa18) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 1928a20..4819e56 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -46,6 +46,17 @@ struct virtio_fs_config { + uint32_t num_queues; + }; + ++/* Callback from libvhost-user */ ++static uint64_t fv_get_features(VuDev *dev) ++{ ++ return 1ULL << VIRTIO_F_VERSION_1; ++} ++ ++/* Callback from libvhost-user */ ++static void fv_set_features(VuDev *dev, uint64_t features) ++{ ++} ++ + /* + * Callback from libvhost-user if there's a new fd we're supposed to listen + * to, typically a queue kick? +@@ -78,7 +89,9 @@ static bool fv_queue_order(VuDev *dev, int qidx) + } + + static const VuDevIface fv_iface = { +- /* TODO: Add other callbacks */ ++ .get_features = fv_get_features, ++ .set_features = fv_set_features, ++ + .queue_is_processed_in_order = fv_queue_order, + }; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-introduce-inode-refcount-to-prevent-use-af.patch b/kvm-virtiofsd-introduce-inode-refcount-to-prevent-use-af.patch new file mode 100755 index 0000000..68d20e7 --- /dev/null +++ b/kvm-virtiofsd-introduce-inode-refcount-to-prevent-use-af.patch @@ -0,0 +1,589 @@ +From da6ee5c24397d2ca93dfaf275fdd9dafc922da15 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:11 +0100 +Subject: [PATCH 100/116] virtiofsd: introduce inode refcount to prevent + use-after-free +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-97-dgilbert@redhat.com> +Patchwork-id: 93550 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 096/112] virtiofsd: introduce inode refcount to prevent use-after-free +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +If thread A is using an inode it must not be deleted by thread B when +processing a FUSE_FORGET request. + +The FUSE protocol itself already has a counter called nlookup that is +used in FUSE_FORGET messages. We cannot trust this counter since the +untrusted client can manipulate it via FUSE_FORGET messages. + +Introduce a new refcount to keep inodes alive for the required lifespan. +lo_inode_put() must be called to release a reference. FUSE's nlookup +counter holds exactly one reference so that the inode stays alive as +long as the client still wants to remember it. + +Note that the lo_inode->is_symlink field is moved to avoid creating a +hole in the struct due to struct field alignment. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Misono Tomohiro +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit c241aa9457d88c6a0d027f48fadfed131646bce3) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 169 +++++++++++++++++++++++++++++++++------ + 1 file changed, 146 insertions(+), 23 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e3a6d6b..ab16135 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -97,7 +97,13 @@ struct lo_key { + + struct lo_inode { + int fd; +- bool is_symlink; ++ ++ /* ++ * Atomic reference count for this object. The nlookup field holds a ++ * reference and release it when nlookup reaches 0. ++ */ ++ gint refcount; ++ + struct lo_key key; + + /* +@@ -116,6 +122,8 @@ struct lo_inode { + fuse_ino_t fuse_ino; + pthread_mutex_t plock_mutex; + GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ ++ ++ bool is_symlink; + }; + + struct lo_cred { +@@ -471,6 +479,23 @@ static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) + return elem - lo_data(req)->ino_map.elems; + } + ++static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep) ++{ ++ struct lo_inode *inode = *inodep; ++ ++ if (!inode) { ++ return; ++ } ++ ++ *inodep = NULL; ++ ++ if (g_atomic_int_dec_and_test(&inode->refcount)) { ++ close(inode->fd); ++ free(inode); ++ } ++} ++ ++/* Caller must release refcount using lo_inode_put() */ + static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + { + struct lo_data *lo = lo_data(req); +@@ -478,6 +503,9 @@ static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->ino_map, ino); ++ if (elem) { ++ g_atomic_int_inc(&elem->inode->refcount); ++ } + pthread_mutex_unlock(&lo->mutex); + + if (!elem) { +@@ -487,10 +515,23 @@ static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + return elem->inode; + } + ++/* ++ * TODO Remove this helper and force callers to hold an 
inode refcount until ++ * they are done with the fd. This will be done in a later patch to make ++ * review easier. ++ */ + static int lo_fd(fuse_req_t req, fuse_ino_t ino) + { + struct lo_inode *inode = lo_inode(req, ino); +- return inode ? inode->fd : -1; ++ int fd; ++ ++ if (!inode) { ++ return -1; ++ } ++ ++ fd = inode->fd; ++ lo_inode_put(lo_data(req), &inode); ++ return fd; + } + + static void lo_init(void *userdata, struct fuse_conn_info *conn) +@@ -545,6 +586,10 @@ static void lo_getattr(fuse_req_t req, fuse_ino_t ino, + fuse_reply_attr(req, &buf, lo->timeout); + } + ++/* ++ * Increments parent->nlookup and caller must release refcount using ++ * lo_inode_put(&parent). ++ */ + static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode, + char path[PATH_MAX], struct lo_inode **parent) + { +@@ -582,6 +627,7 @@ retry: + p = &lo->root; + pthread_mutex_lock(&lo->mutex); + p->nlookup++; ++ g_atomic_int_inc(&p->refcount); + pthread_mutex_unlock(&lo->mutex); + } else { + *last = '\0'; +@@ -625,6 +671,7 @@ retry: + + fail_unref: + unref_inode_lolocked(lo, p, 1); ++ lo_inode_put(lo, &p); + fail: + if (retries) { + retries--; +@@ -663,6 +710,7 @@ fallback: + if (res != -1) { + res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW); + unref_inode_lolocked(lo, parent, 1); ++ lo_inode_put(lo, &parent); + } + + return res; +@@ -780,11 +828,13 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + goto out_err; + } + } ++ lo_inode_put(lo, &inode); + + return lo_getattr(req, ino, fi); + + out_err: + saverr = errno; ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + } + +@@ -801,6 +851,7 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + if (p) { + assert(p->nlookup > 0); + p->nlookup++; ++ g_atomic_int_inc(&p->refcount); + } + pthread_mutex_unlock(&lo->mutex); + +@@ -820,6 +871,10 @@ static void posix_locks_value_destroy(gpointer data) + free(plock); + } + ++/* ++ * Increments nlookup and caller 
must release refcount using ++ * lo_inode_put(&parent). ++ */ + static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct fuse_entry_param *e) + { +@@ -827,7 +882,8 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + int res; + int saverr; + struct lo_data *lo = lo_data(req); +- struct lo_inode *inode, *dir = lo_inode(req, parent); ++ struct lo_inode *inode = NULL; ++ struct lo_inode *dir = lo_inode(req, parent); + + /* + * name_to_handle_at() and open_by_handle_at() can reach here with fuse +@@ -868,6 +924,13 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + inode->is_symlink = S_ISLNK(e->attr.st_mode); ++ ++ /* ++ * One for the caller and one for nlookup (released in ++ * unref_inode_lolocked()) ++ */ ++ g_atomic_int_set(&inode->refcount, 2); ++ + inode->nlookup = 1; + inode->fd = newfd; + newfd = -1; +@@ -883,6 +946,8 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + pthread_mutex_unlock(&lo->mutex); + } + e->ino = inode->fuse_ino; ++ lo_inode_put(lo, &inode); ++ lo_inode_put(lo, &dir); + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, + name, (unsigned long long)e->ino); +@@ -894,6 +959,8 @@ out_err: + if (newfd != -1) { + close(newfd); + } ++ lo_inode_put(lo, &inode); ++ lo_inode_put(lo, &dir); + return saverr; + } + +@@ -991,6 +1058,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + { + int res; + int saverr; ++ struct lo_data *lo = lo_data(req); + struct lo_inode *dir; + struct fuse_entry_param e; + struct lo_cred old = {}; +@@ -1032,9 +1100,11 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); ++ lo_inode_put(lo, &dir); + return; + + out: ++ lo_inode_put(lo, &dir); + fuse_reply_err(req, saverr); + } + +@@ -1085,6 +1155,7 @@ fallback: + if (res != -1) { + res = linkat(parent->fd, path, dfd, name, 0); + 
unref_inode_lolocked(lo, parent, 1); ++ lo_inode_put(lo, &parent); + } + + return res; +@@ -1095,6 +1166,7 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + { + int res; + struct lo_data *lo = lo_data(req); ++ struct lo_inode *parent_inode; + struct lo_inode *inode; + struct fuse_entry_param e; + int saverr; +@@ -1104,17 +1176,18 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + return; + } + ++ parent_inode = lo_inode(req, parent); + inode = lo_inode(req, ino); +- if (!inode) { +- fuse_reply_err(req, EBADF); +- return; ++ if (!parent_inode || !inode) { ++ errno = EBADF; ++ goto out_err; + } + + memset(&e, 0, sizeof(struct fuse_entry_param)); + e.attr_timeout = lo->timeout; + e.entry_timeout = lo->timeout; + +- res = linkat_empty_nofollow(lo, inode, lo_fd(req, parent), name); ++ res = linkat_empty_nofollow(lo, inode, parent_inode->fd, name); + if (res == -1) { + goto out_err; + } +@@ -1133,13 +1206,18 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); ++ lo_inode_put(lo, &parent_inode); ++ lo_inode_put(lo, &inode); + return; + + out_err: + saverr = errno; ++ lo_inode_put(lo, &parent_inode); ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + } + ++/* Increments nlookup and caller must release refcount using lo_inode_put() */ + static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent, + const char *name) + { +@@ -1176,6 +1254,7 @@ static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) + + fuse_reply_err(req, res == -1 ? 
errno : 0); + unref_inode_lolocked(lo, inode, 1); ++ lo_inode_put(lo, &inode); + } + + static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, +@@ -1183,8 +1262,10 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + unsigned int flags) + { + int res; +- struct lo_inode *oldinode; +- struct lo_inode *newinode; ++ struct lo_inode *parent_inode; ++ struct lo_inode *newparent_inode; ++ struct lo_inode *oldinode = NULL; ++ struct lo_inode *newinode = NULL; + struct lo_data *lo = lo_data(req); + + if (!is_safe_path_component(name) || !is_safe_path_component(newname)) { +@@ -1192,6 +1273,13 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + return; + } + ++ parent_inode = lo_inode(req, parent); ++ newparent_inode = lo_inode(req, newparent); ++ if (!parent_inode || !newparent_inode) { ++ fuse_reply_err(req, EBADF); ++ goto out; ++ } ++ + oldinode = lookup_name(req, parent, name); + newinode = lookup_name(req, newparent, newname); + +@@ -1204,8 +1292,8 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + #ifndef SYS_renameat2 + fuse_reply_err(req, EINVAL); + #else +- res = syscall(SYS_renameat2, lo_fd(req, parent), name, +- lo_fd(req, newparent), newname, flags); ++ res = syscall(SYS_renameat2, parent_inode->fd, name, ++ newparent_inode->fd, newname, flags); + if (res == -1 && errno == ENOSYS) { + fuse_reply_err(req, EINVAL); + } else { +@@ -1215,12 +1303,16 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out; + } + +- res = renameat(lo_fd(req, parent), name, lo_fd(req, newparent), newname); ++ res = renameat(parent_inode->fd, name, newparent_inode->fd, newname); + + fuse_reply_err(req, res == -1 ? 
errno : 0); + out: + unref_inode_lolocked(lo, oldinode, 1); + unref_inode_lolocked(lo, newinode, 1); ++ lo_inode_put(lo, &oldinode); ++ lo_inode_put(lo, &newinode); ++ lo_inode_put(lo, &parent_inode); ++ lo_inode_put(lo, &newparent_inode); + } + + static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) +@@ -1244,6 +1336,7 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + + fuse_reply_err(req, res == -1 ? errno : 0); + unref_inode_lolocked(lo, inode, 1); ++ lo_inode_put(lo, &inode); + } + + static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, +@@ -1265,8 +1358,9 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + g_hash_table_destroy(inode->posix_locks); + pthread_mutex_destroy(&inode->plock_mutex); + pthread_mutex_unlock(&lo->mutex); +- close(inode->fd); +- free(inode); ++ ++ /* Drop our refcount from lo_do_lookup() */ ++ lo_inode_put(lo, &inode); + } else { + pthread_mutex_unlock(&lo->mutex); + } +@@ -1280,6 +1374,7 @@ static int unref_all_inodes_cb(gpointer key, gpointer value, gpointer user_data) + inode->nlookup = 0; + lo_map_remove(&lo->ino_map, inode->fuse_ino); + close(inode->fd); ++ lo_inode_put(lo, &inode); /* Drop our refcount from lo_do_lookup() */ + + return TRUE; + } +@@ -1306,6 +1401,7 @@ static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + (unsigned long long)nlookup); + + unref_inode_lolocked(lo, inode, nlookup); ++ lo_inode_put(lo, &inode); + } + + static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) +@@ -1537,6 +1633,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + err = 0; + error: + lo_dirp_put(&d); ++ lo_inode_put(lo, &dinode); + + /* + * If there's an error, we can only signal it if we haven't stored +@@ -1595,6 +1692,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + { + int fd; + struct lo_data *lo = lo_data(req); ++ struct lo_inode 
*parent_inode; + struct fuse_entry_param e; + int err; + struct lo_cred old = {}; +@@ -1607,12 +1705,18 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + return; + } + ++ parent_inode = lo_inode(req, parent); ++ if (!parent_inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + err = lo_change_cred(req, &old); + if (err) { + goto out; + } + +- fd = openat(lo_fd(req, parent), name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, ++ fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, + mode); + err = fd == -1 ? errno : 0; + lo_restore_cred(&old); +@@ -1625,8 +1729,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + close(fd); +- fuse_reply_err(req, ENOMEM); +- return; ++ err = ENOMEM; ++ goto out; + } + + fi->fh = fh; +@@ -1639,6 +1743,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + out: ++ lo_inode_put(lo, &parent_inode); ++ + if (err) { + fuse_reply_err(req, err); + } else { +@@ -1712,16 +1818,18 @@ static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + plock = + lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); + if (!plock) { +- pthread_mutex_unlock(&inode->plock_mutex); +- fuse_reply_err(req, ret); +- return; ++ saverr = ret; ++ goto out; + } + + ret = fcntl(plock->fd, F_OFD_GETLK, lock); + if (ret == -1) { + saverr = errno; + } ++ ++out: + pthread_mutex_unlock(&inode->plock_mutex); ++ lo_inode_put(lo, &inode); + + if (saverr) { + fuse_reply_err(req, saverr); +@@ -1761,9 +1869,8 @@ static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); + + if (!plock) { +- pthread_mutex_unlock(&inode->plock_mutex); +- fuse_reply_err(req, ret); +- return; ++ saverr = ret; ++ goto out; + } + + /* TODO: Is it alright to modify flock? 
*/ +@@ -1772,7 +1879,11 @@ static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + if (ret == -1) { + saverr = errno; + } ++ ++out: + pthread_mutex_unlock(&inode->plock_mutex); ++ lo_inode_put(lo, &inode); ++ + fuse_reply_err(req, saverr); + } + +@@ -1898,6 +2009,7 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + pthread_mutex_unlock(&inode->plock_mutex); + + res = close(dup(lo_fi_fd(req, fi))); ++ lo_inode_put(lo_data(req), &inode); + fuse_reply_err(req, res == -1 ? errno : 0); + } + +@@ -2115,11 +2227,14 @@ out_free: + if (fd >= 0) { + close(fd); + } ++ ++ lo_inode_put(lo, &inode); + return; + + out_err: + saverr = errno; + out: ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; + } +@@ -2190,11 +2305,14 @@ out_free: + if (fd >= 0) { + close(fd); + } ++ ++ lo_inode_put(lo, &inode); + return; + + out_err: + saverr = errno; + out: ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; + } +@@ -2243,6 +2361,8 @@ out: + if (fd >= 0) { + close(fd); + } ++ ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + } + +@@ -2289,6 +2409,8 @@ out: + if (fd >= 0) { + close(fd); + } ++ ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + } + +@@ -2671,6 +2793,7 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) + root->key.ino = stat.st_ino; + root->key.dev = stat.st_dev; + root->nlookup = 2; ++ g_atomic_int_set(&root->refcount, 2); + } + + static guint lo_key_hash(gconstpointer key) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-jail-lo-proc_self_fd.patch b/kvm-virtiofsd-jail-lo-proc_self_fd.patch new file mode 100755 index 0000000..df69242 --- /dev/null +++ b/kvm-virtiofsd-jail-lo-proc_self_fd.patch @@ -0,0 +1,85 @@ +From 852a0a22d674b0594aecf0912a0885d197f34978 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:57 +0100 +Subject: [PATCH 6/9] virtiofsd: jail lo->proc_self_fd + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200505163600.22956-5-dgilbert@redhat.com> +Patchwork-id: 96275 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 4/7] virtiofsd: jail lo->proc_self_fd +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. Tsirkin + +From: Miklos Szeredi + +While it's not possible to escape the proc filesystem through +lo->proc_self_fd, it is possible to escape to the root of the proc +filesystem itself through "../..". + +Use a temporary mount for opening lo->proc_self_fd, that has its root at +/proc/self/fd/, preventing access to the ancestor directories. + +Signed-off-by: Miklos Szeredi +Message-Id: <20200429124733.22488-1-mszeredi@redhat.com> +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 397ae982f4df46e7d4b2625c431062c9146f3b83) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 27 +++++++++++++++++++++++++-- + 1 file changed, 25 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 184ad0f..73d8405 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2540,6 +2540,8 @@ static void print_capabilities(void) + static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + { + pid_t child; ++ char template[] = "virtiofsd-XXXXXX"; ++ char *tmpdir; + + /* + * Create a new pid namespace for *child* processes. 
We'll have to +@@ -2601,12 +2603,33 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + exit(1); + } + ++ tmpdir = mkdtemp(template); ++ if (!tmpdir) { ++ fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template); ++ exit(1); ++ } ++ ++ if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n", ++ tmpdir); ++ exit(1); ++ } ++ + /* Now we can get our /proc/self/fd directory file descriptor */ +- lo->proc_self_fd = open("/proc/self/fd", O_PATH); ++ lo->proc_self_fd = open(tmpdir, O_PATH); + if (lo->proc_self_fd == -1) { +- fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); ++ fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir); + exit(1); + } ++ ++ if (umount2(tmpdir, MNT_DETACH) < 0) { ++ fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir); ++ exit(1); ++ } ++ ++ if (rmdir(tmpdir) < 0) { ++ fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir); ++ } + } + + /* +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-load_capng-missing-unlock.patch b/kvm-virtiofsd-load_capng-missing-unlock.patch new file mode 100755 index 0000000..bc04f6b --- /dev/null +++ b/kvm-virtiofsd-load_capng-missing-unlock.patch @@ -0,0 +1,46 @@ +From ece7649025fbdbde48ff0b954e8ec2e42c4a8b3d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:10 +0000 +Subject: [PATCH 14/18] virtiofsd: load_capng missing unlock +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-4-dgilbert@redhat.com> +Patchwork-id: 94126 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/7] virtiofsd: load_capng missing unlock +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: "Dr. David Alan Gilbert" + +Missing unlock in error path. + +Fixes: Covertiy CID 1413123 +Signed-off-by: Dr. 
David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 686391112fd42c615bcc4233472887a66a9b5a4a) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e6f2399..c635fc8 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -232,6 +232,7 @@ static int load_capng(void) + */ + cap.saved = capng_save_state(); + if (!cap.saved) { ++ pthread_mutex_unlock(&cap.mutex); + fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n"); + return -EINVAL; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-make-f-foreground-the-default.patch b/kvm-virtiofsd-make-f-foreground-the-default.patch new file mode 100755 index 0000000..d6cb0e3 --- /dev/null +++ b/kvm-virtiofsd-make-f-foreground-the-default.patch @@ -0,0 +1,76 @@ +From 7f2e1f79a3addb242c3018c7a80e2e57589119f0 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:08 +0100 +Subject: [PATCH 037/116] virtiofsd: make -f (foreground) the default +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-34-dgilbert@redhat.com> +Patchwork-id: 93489 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 033/112] virtiofsd: make -f (foreground) the default +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +According to vhost-user.rst "Backend program conventions", backend +programs should run in the foreground by default. Follow the +conventions so libvirt and other management tools can control virtiofsd +in a standard way. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 0bbd31753714ac2899efda0f0de31e353e965789) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 676032e..a3645fc 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -29,6 +29,11 @@ + { \ + t, offsetof(struct fuse_cmdline_opts, p), 1 \ + } ++#define FUSE_HELPER_OPT_VALUE(t, p, v) \ ++ { \ ++ t, offsetof(struct fuse_cmdline_opts, p), v \ ++ } ++ + + static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("-h", show_help), +@@ -42,6 +47,7 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), + FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("-f", foreground), ++ FUSE_HELPER_OPT_VALUE("--daemonize", foreground, 0), + FUSE_HELPER_OPT("fsname=", nodefault_subtype), + FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("subtype=", nodefault_subtype), +@@ -131,6 +137,7 @@ void fuse_cmdline_help(void) + " -V --version print version\n" + " -d -o debug enable debug output (implies -f)\n" + " -f foreground operation\n" ++ " --daemonize run in background\n" + " -o max_idle_threads the maximum number of idle worker " + "threads\n" + " allowed (default: 10)\n"); +@@ -158,6 +165,7 @@ int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) + memset(opts, 0, sizeof(struct fuse_cmdline_opts)); + + opts->max_idle_threads = 10; ++ opts->foreground = 1; + + if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == + -1) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-make-lo_release-atomic.patch b/kvm-virtiofsd-make-lo_release-atomic.patch new file mode 100755 index 0000000..6d88549 --- /dev/null +++ b/kvm-virtiofsd-make-lo_release-atomic.patch @@ -0,0 +1,62 @@ +From 4ebabb66f4132186152edf8e1907fce436bf5c69 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:06 +0100 +Subject: [PATCH 095/116] virtiofsd: make lo_release() atomic +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-92-dgilbert@redhat.com> +Patchwork-id: 93545 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 091/112] virtiofsd: make lo_release() atomic +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Hold the lock across both lo_map_get() and lo_map_remove() to prevent +races between two FUSE_RELEASE requests. In this case I don't see a +serious bug but it's safer to do things atomically. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit baed65c060c0e524530bc243eec427fb408bd477) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9414935..690edbc 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1772,14 +1772,18 @@ static void lo_release(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) + { + struct lo_data *lo = lo_data(req); +- int fd; ++ struct lo_map_elem *elem; ++ int fd = -1; + + (void)ino; + +- fd = lo_fi_fd(req, fi); +- + pthread_mutex_lock(&lo->mutex); +- lo_map_remove(&lo->fd_map, fi->fh); ++ elem = lo_map_get(&lo->fd_map, fi->fh); ++ if (elem) { ++ fd = elem->fd; ++ elem = NULL; ++ lo_map_remove(&lo->fd_map, fi->fh); ++ } + pthread_mutex_unlock(&lo->mutex); + + close(fd); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-move-to-a-new-pid-namespace.patch b/kvm-virtiofsd-move-to-a-new-pid-namespace.patch new file mode 100755 index 0000000..9a33d1b --- /dev/null +++ 
b/kvm-virtiofsd-move-to-a-new-pid-namespace.patch @@ -0,0 +1,223 @@ +From a7a87a751a9893830d031a957a751b7622b71fb2 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:29 +0100 +Subject: [PATCH 058/116] virtiofsd: move to a new pid namespace +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-55-dgilbert@redhat.com> +Patchwork-id: 93510 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 054/112] virtiofsd: move to a new pid namespace +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +virtiofsd needs access to /proc/self/fd. Let's move to a new pid +namespace so that a compromised process cannot see any other +processes running on the system. + +One wrinkle in this approach: unshare(CLONE_NEWPID) affects *child* +processes and not the current process. Therefore we need to fork the +pid 1 process that will actually run virtiofsd and leave a parent in +waitpid(2). This is not the same thing as daemonization and parent +processes should not notice a difference. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 8e1d4ef231d8327be219f7aea7aa15d181375bbc) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 134 +++++++++++++++++++++++++-------------- + 1 file changed, 86 insertions(+), 48 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 27ab328..0947d14 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -51,7 +51,10 @@ + #include + #include + #include ++#include + #include ++#include ++#include + #include + #include + +@@ -1945,24 +1948,95 @@ static void print_capabilities(void) + } + + /* +- * Called after our UNIX domain sockets have been created, now we can move to +- * an empty network namespace to prevent TCP/IP and other network activity in +- * case this process is compromised. ++ * Move to a new mount, net, and pid namespaces to isolate this process. + */ +-static void setup_net_namespace(void) ++static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + { +- if (unshare(CLONE_NEWNET) != 0) { +- fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNET): %m\n"); ++ pid_t child; ++ ++ /* ++ * Create a new pid namespace for *child* processes. We'll have to ++ * fork in order to enter the new pid namespace. A new mount namespace ++ * is also needed so that we can remount /proc for the new pid ++ * namespace. ++ * ++ * Our UNIX domain sockets have been created. Now we can move to ++ * an empty network namespace to prevent TCP/IP and other network ++ * activity in case this process is compromised. 
++ */ ++ if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) { ++ fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n"); ++ exit(1); ++ } ++ ++ child = fork(); ++ if (child < 0) { ++ fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n"); ++ exit(1); ++ } ++ if (child > 0) { ++ pid_t waited; ++ int wstatus; ++ ++ /* The parent waits for the child */ ++ do { ++ waited = waitpid(child, &wstatus, 0); ++ } while (waited < 0 && errno == EINTR && !se->exited); ++ ++ /* We were terminated by a signal, see fuse_signals.c */ ++ if (se->exited) { ++ exit(0); ++ } ++ ++ if (WIFEXITED(wstatus)) { ++ exit(WEXITSTATUS(wstatus)); ++ } ++ ++ exit(1); ++ } ++ ++ /* Send us SIGTERM when the parent thread terminates, see prctl(2) */ ++ prctl(PR_SET_PDEATHSIG, SIGTERM); ++ ++ /* ++ * If the mounts have shared propagation then we want to opt out so our ++ * mount changes don't affect the parent mount namespace. ++ */ ++ if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n"); ++ exit(1); ++ } ++ ++ /* The child must remount /proc to use the new pid namespace */ ++ if (mount("proc", "/proc", "proc", ++ MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n"); ++ exit(1); ++ } ++ ++ /* Now we can get our /proc/self/fd directory file descriptor */ ++ lo->proc_self_fd = open("/proc/self/fd", O_PATH); ++ if (lo->proc_self_fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); + exit(1); + } + } + +-/* This magic is based on lxc's lxc_pivot_root() */ +-static void setup_pivot_root(const char *source) ++/* ++ * Make the source directory our root so symlinks cannot escape and no other ++ * files are accessible. Assumes unshare(CLONE_NEWNS) was already called. 
++ */ ++static void setup_mounts(const char *source) + { + int oldroot; + int newroot; + ++ if (mount(source, source, NULL, MS_BIND, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); ++ exit(1); ++ } ++ ++ /* This magic is based on lxc's lxc_pivot_root() */ + oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (oldroot < 0) { + fuse_log(FUSE_LOG_ERR, "open(/): %m\n"); +@@ -2009,47 +2083,14 @@ static void setup_pivot_root(const char *source) + close(oldroot); + } + +-static void setup_proc_self_fd(struct lo_data *lo) +-{ +- lo->proc_self_fd = open("/proc/self/fd", O_PATH); +- if (lo->proc_self_fd == -1) { +- fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); +- exit(1); +- } +-} +- +-/* +- * Make the source directory our root so symlinks cannot escape and no other +- * files are accessible. +- */ +-static void setup_mount_namespace(const char *source) +-{ +- if (unshare(CLONE_NEWNS) != 0) { +- fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNS): %m\n"); +- exit(1); +- } +- +- if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { +- fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_PRIVATE): %m\n"); +- exit(1); +- } +- +- if (mount(source, source, NULL, MS_BIND, NULL) < 0) { +- fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); +- exit(1); +- } +- +- setup_pivot_root(source); +-} +- + /* + * Lock down this process to prevent access to other processes or files outside + * source directory. This reduces the impact of arbitrary code execution bugs. 
+ */ +-static void setup_sandbox(struct lo_data *lo) ++static void setup_sandbox(struct lo_data *lo, struct fuse_session *se) + { +- setup_net_namespace(); +- setup_mount_namespace(lo->source); ++ setup_namespaces(lo, se); ++ setup_mounts(lo->source); + } + + int main(int argc, char *argv[]) +@@ -2173,10 +2214,7 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + +- /* Must be after daemonize to get the right /proc/self/fd */ +- setup_proc_self_fd(&lo); +- +- setup_sandbox(&lo); ++ setup_sandbox(&lo, se); + + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-move-to-an-empty-network-namespace.patch b/kvm-virtiofsd-move-to-an-empty-network-namespace.patch new file mode 100755 index 0000000..69a7c20 --- /dev/null +++ b/kvm-virtiofsd-move-to-an-empty-network-namespace.patch @@ -0,0 +1,66 @@ +From 19a16f26bdeb6302159736e182a18b06160a3f42 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:28 +0100 +Subject: [PATCH 057/116] virtiofsd: move to an empty network namespace +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-54-dgilbert@redhat.com> +Patchwork-id: 93508 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 053/112] virtiofsd: move to an empty network namespace +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +If the process is compromised there should be no network access. Use an +empty network namespace to sandbox networking. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit d74830d12ae233186ff74ddf64c552d26bb39e50) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 0570453..27ab328 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1944,6 +1944,19 @@ static void print_capabilities(void) + printf("}\n"); + } + ++/* ++ * Called after our UNIX domain sockets have been created, now we can move to ++ * an empty network namespace to prevent TCP/IP and other network activity in ++ * case this process is compromised. ++ */ ++static void setup_net_namespace(void) ++{ ++ if (unshare(CLONE_NEWNET) != 0) { ++ fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNET): %m\n"); ++ exit(1); ++ } ++} ++ + /* This magic is based on lxc's lxc_pivot_root() */ + static void setup_pivot_root(const char *source) + { +@@ -2035,6 +2048,7 @@ static void setup_mount_namespace(const char *source) + */ + static void setup_sandbox(struct lo_data *lo) + { ++ setup_net_namespace(); + setup_mount_namespace(lo->source); + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-only-retain-file-system-capabilities.patch b/kvm-virtiofsd-only-retain-file-system-capabilities.patch new file mode 100755 index 0000000..15c8cd8 --- /dev/null +++ b/kvm-virtiofsd-only-retain-file-system-capabilities.patch @@ -0,0 +1,112 @@ +From 8727e4904e7a6588e39f231d837f4527f265e47e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:59 +0100 +Subject: [PATCH 8/9] virtiofsd: only retain file system capabilities + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-7-dgilbert@redhat.com> +Patchwork-id: 96272 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 6/7] virtiofsd: only retain file system capabilities +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. 
Tsirkin + +From: Stefan Hajnoczi + +virtiofsd runs as root but only needs a subset of root's Linux +capabilities(7). As a file server its purpose is to create and access +files on behalf of a client. It needs to be able to access files with +arbitrary uid/gid owners. It also needs to be able to create device nodes. + +Introduce a Linux capabilities(7) whitelist and drop all capabilities +that we don't need, making the virtiofsd process less powerful than a +regular uid root process. + + # cat /proc/PID/status + ... + Before After + CapInh: 0000000000000000 0000000000000000 + CapPrm: 0000003fffffffff 00000000880000df + CapEff: 0000003fffffffff 00000000880000df + CapBnd: 0000003fffffffff 0000000000000000 + CapAmb: 0000000000000000 0000000000000000 + +Note that file capabilities cannot be used to achieve the same effect on +the virtiofsd executable because mount is used during sandbox setup. +Therefore we drop capabilities programmatically at the right point +during startup. + +This patch only affects the sandboxed child process. The parent process +that sits in waitpid(2) still has full root capabilities and will be +addressed in the next patch. + +Signed-off-by: Stefan Hajnoczi +Message-Id: <20200416164907.244868-2-stefanha@redhat.com> +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a59feb483b8fae24d043569ccfcc97ea23d54a02) +Signed-off-by: Danilo C. L. 
de Paula +--- + tools/virtiofsd/passthrough_ll.c | 38 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 38 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 614ba55..6358874 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2723,6 +2723,43 @@ static void setup_mounts(const char *source) + } + + /* ++ * Only keep whitelisted capabilities that are needed for file system operation ++ */ ++static void setup_capabilities(void) ++{ ++ pthread_mutex_lock(&cap.mutex); ++ capng_restore_state(&cap.saved); ++ ++ /* ++ * Whitelist file system-related capabilities that are needed for a file ++ * server to act like root. Drop everything else like networking and ++ * sysadmin capabilities. ++ * ++ * Exclusions: ++ * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl ++ * and we don't support that. ++ * 2. CAP_MAC_OVERRIDE is not included because it only seems to be ++ * used by the Smack LSM. Omit it until there is demand for it. ++ */ ++ capng_setpid(syscall(SYS_gettid)); ++ capng_clear(CAPNG_SELECT_BOTH); ++ capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, ++ CAP_CHOWN, ++ CAP_DAC_OVERRIDE, ++ CAP_DAC_READ_SEARCH, ++ CAP_FOWNER, ++ CAP_FSETID, ++ CAP_SETGID, ++ CAP_SETUID, ++ CAP_MKNOD, ++ CAP_SETFCAP); ++ capng_apply(CAPNG_SELECT_BOTH); ++ ++ cap.saved = capng_save_state(); ++ pthread_mutex_unlock(&cap.mutex); ++} ++ ++/* + * Lock down this process to prevent access to other processes or files outside + * source directory. This reduces the impact of arbitrary code execution bugs. 
+ */ +@@ -2732,6 +2769,7 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, + setup_namespaces(lo, se); + setup_mounts(lo->source); + setup_seccomp(enable_syslog); ++ setup_capabilities(); + } + + /* Set the maximum number of open file descriptors */ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch b/kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch new file mode 100755 index 0000000..f21d793 --- /dev/null +++ b/kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch @@ -0,0 +1,124 @@ +From f2c0b07088966c396ddcee54f4bed97cdb01192f Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 9 Feb 2021 23:14:55 -0500 +Subject: [PATCH 2/3] virtiofsd: optionally return inode pointer from + lo_do_lookup() + +RH-Author: Jon Maloy +Message-id: <20210209231456.1555472-3-jmaloy@redhat.com> +Patchwork-id: 101022 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/3] virtiofsd: optionally return inode pointer from lo_do_lookup() +Bugzilla: 1919111 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Greg Kurz +RH-Acked-by: Dr. David Alan Gilbert + +From: Stefan Hajnoczi + +lo_do_lookup() finds an existing inode or allocates a new one. It +increments nlookup so that the inode stays alive until the client +releases it. + +Existing callers don't need the struct lo_inode so the function doesn't +return it. Extend the function to optionally return the inode. The next +commit will need it. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Greg Kurz +Message-Id: <20210204150208.367837-3-stefanha@redhat.com> +Signed-off-by: Dr. 
David Alan Gilbert + +(cherry-picked from commit 22d2ece71e533310da31f2857ebc4a00d91968b3) +Signed-off-by: Jon Maloy +Signed-off-by: Jon Maloy +--- + tools/virtiofsd/passthrough_ll.c | 29 +++++++++++++++++++++-------- + 1 file changed, 21 insertions(+), 8 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 518ba11c47..e5bd3d73e4 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -878,11 +878,13 @@ static void posix_locks_value_destroy(gpointer data) + } + + /* +- * Increments nlookup and caller must release refcount using +- * lo_inode_put(&parent). ++ * Increments nlookup on the inode on success. unref_inode_lolocked() must be ++ * called eventually to decrement nlookup again. If inodep is non-NULL, the ++ * inode pointer is stored and the caller must call lo_inode_put(). + */ + static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, +- struct fuse_entry_param *e) ++ struct fuse_entry_param *e, ++ struct lo_inode **inodep) + { + int newfd; + int res; +@@ -891,6 +893,10 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct lo_inode *inode = NULL; + struct lo_inode *dir = lo_inode(req, parent); + ++ if (inodep) { ++ *inodep = NULL; ++ } ++ + /* + * name_to_handle_at() and open_by_handle_at() can reach here with fuse + * mount point in guest, but we don't have its inode info in the +@@ -953,7 +959,14 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + pthread_mutex_unlock(&lo->mutex); + } + e->ino = inode->fuse_ino; +- lo_inode_put(lo, &inode); ++ ++ /* Transfer ownership of inode pointer to caller or drop it */ ++ if (inodep) { ++ *inodep = inode; ++ } else { ++ lo_inode_put(lo, &inode); ++ } ++ + lo_inode_put(lo, &dir); + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, +@@ -988,7 +1001,7 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char 
*name) + return; + } + +- err = lo_do_lookup(req, parent, name, &e); ++ err = lo_do_lookup(req, parent, name, &e, NULL); + if (err) { + fuse_reply_err(req, err); + } else { +@@ -1098,7 +1111,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + goto out; + } + +- saverr = lo_do_lookup(req, parent, name, &e); ++ saverr = lo_do_lookup(req, parent, name, &e, NULL); + if (saverr) { + goto out; + } +@@ -1599,7 +1612,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + + if (plus) { + if (!is_dot_or_dotdot(name)) { +- err = lo_do_lookup(req, ino, name, &e); ++ err = lo_do_lookup(req, ino, name, &e, NULL); + if (err) { + goto error; + } +@@ -1793,7 +1806,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + fi->fh = fh; +- err = lo_do_lookup(req, parent, name, &e); ++ err = lo_do_lookup(req, parent, name, &e, NULL); + } + if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; +-- +2.18.2 + diff --git a/kvm-virtiofsd-passthrough_ll-Pass-errno-to-fuse_reply_er.patch b/kvm-virtiofsd-passthrough_ll-Pass-errno-to-fuse_reply_er.patch new file mode 100755 index 0000000..e3d5773 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-Pass-errno-to-fuse_reply_er.patch @@ -0,0 +1,54 @@ +From fe031dbbf5e287f64de9fcc9aec361e8ab492109 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:24 +0100 +Subject: [PATCH 113/116] virtiofsd/passthrough_ll: Pass errno to + fuse_reply_err() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-110-dgilbert@redhat.com> +Patchwork-id: 93559 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 109/112] virtiofsd/passthrough_ll: Pass errno to fuse_reply_err() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Xiao Yang + +lo_copy_file_range() passes -errno to fuse_reply_err() and then fuse_reply_err() +changes it to errno again, so that subsequent fuse_send_reply_iov_nofree() catches +the wrong errno (i.e. reports "fuse: bad error value: ..."). + +Make fuse_send_reply_iov_nofree() accept the correct -errno by passing errno +directly in lo_copy_file_range(). + +Signed-off-by: Xiao Yang +Reviewed-by: Eryu Guan + +dgilbert: Sent upstream and now Merged as aa1185e153f774f1df65 +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a931b6861e59c78d861017e9c6a9c161ff49a163) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index fc15d61..e6f2399 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2441,7 +2441,7 @@ static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, + + res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags); + if (res < 0) { +- fuse_reply_err(req, -errno); ++ fuse_reply_err(req, errno); + } else { + fuse_reply_write(req, res); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-Use-cache_readdir-for-direc.patch b/kvm-virtiofsd-passthrough_ll-Use-cache_readdir-for-direc.patch new file mode 100755 index 0000000..ddacdbe --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-Use-cache_readdir-for-direc.patch @@ -0,0 +1,48 @@ +From 83b03fc4a3ecf6086394363488bbebc8d55428c0 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:16 +0100 +Subject: [PATCH 105/116] virtiofsd: passthrough_ll: Use cache_readdir for + directory open +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-102-dgilbert@redhat.com> +Patchwork-id: 93555 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 101/112] virtiofsd: passthrough_ll: Use cache_readdir for directory open +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Misono Tomohiro + +Since keep_cache(FOPEN_KEEP_CACHE) has no effect for directory as +described in fuse_common.h, use cache_readdir(FOPEN_CACHE_DIR) for +directory open when cache=always mode. + +Signed-off-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9b610b09b49b1aada256097b338d49da805da6ae) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 4c61ac5..79b8b71 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1523,7 +1523,7 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + + fi->fh = fh; + if (lo->cache == CACHE_ALWAYS) { +- fi->keep_cache = 1; ++ fi->cache_readdir = 1; + } + fuse_reply_open(req, fi); + return; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-dirp_map-to-hide-lo_dir.patch b/kvm-virtiofsd-passthrough_ll-add-dirp_map-to-hide-lo_dir.patch new file mode 100755 index 0000000..0506574 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-dirp_map-to-hide-lo_dir.patch @@ -0,0 +1,238 @@ +From 474d0adafed4d73720d6413b2903d6c4b529e5e6 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:15 +0100 +Subject: [PATCH 044/116] virtiofsd: passthrough_ll: add dirp_map to hide + lo_dirp pointers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-41-dgilbert@redhat.com> +Patchwork-id: 93495 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 040/112] virtiofsd: passthrough_ll: add dirp_map to hide lo_dirp pointers +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Do not expose lo_dirp pointers to clients. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit b39bce121bfad8757eec0ee41f14607b883935d3) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 103 +++++++++++++++++++++++++++++---------- + 1 file changed, 76 insertions(+), 27 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index a3ebf74..5f5a72f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -56,27 +56,10 @@ + + #include "passthrough_helpers.h" + +-/* +- * We are re-using pointers to our `struct lo_inode` +- * elements as inodes. This means that we must be able to +- * store uintptr_t values in a fuse_ino_t variable. The following +- * incantation checks this condition at compile time. +- */ +-#if defined(__GNUC__) && \ +- (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && \ +- !defined __cplusplus +-_Static_assert(sizeof(fuse_ino_t) >= sizeof(uintptr_t), +- "fuse_ino_t too small to hold uintptr_t values!"); +-#else +-struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct { +- unsigned _uintptr_to_must_hold_fuse_ino_t +- : ((sizeof(fuse_ino_t) >= sizeof(uintptr_t)) ? 
1 : -1); +-}; +-#endif +- + struct lo_map_elem { + union { + struct lo_inode *inode; ++ struct lo_dirp *dirp; + ssize_t freelist; + }; + bool in_use; +@@ -123,6 +106,7 @@ struct lo_data { + int timeout_set; + struct lo_inode root; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ ++ struct lo_map dirp_map; /* protected by lo->mutex */ + }; + + static const struct fuse_opt lo_opts[] = { +@@ -253,6 +237,20 @@ static void lo_map_remove(struct lo_map *map, size_t key) + } + + /* Assumes lo->mutex is held */ ++static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp) ++{ ++ struct lo_map_elem *elem; ++ ++ elem = lo_map_alloc_elem(&lo_data(req)->dirp_map); ++ if (!elem) { ++ return -1; ++ } ++ ++ elem->dirp = dirp; ++ return elem - lo_data(req)->dirp_map.elems; ++} ++ ++/* Assumes lo->mutex is held */ + static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) + { + struct lo_map_elem *elem; +@@ -861,9 +859,19 @@ struct lo_dirp { + off_t offset; + }; + +-static struct lo_dirp *lo_dirp(struct fuse_file_info *fi) ++static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) + { +- return (struct lo_dirp *)(uintptr_t)fi->fh; ++ struct lo_data *lo = lo_data(req); ++ struct lo_map_elem *elem; ++ ++ pthread_mutex_lock(&lo->mutex); ++ elem = lo_map_get(&lo->dirp_map, fi->fh); ++ pthread_mutex_unlock(&lo->mutex); ++ if (!elem) { ++ return NULL; ++ } ++ ++ return elem->dirp; + } + + static void lo_opendir(fuse_req_t req, fuse_ino_t ino, +@@ -873,6 +881,7 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + struct lo_data *lo = lo_data(req); + struct lo_dirp *d; + int fd; ++ ssize_t fh; + + d = calloc(1, sizeof(struct lo_dirp)); + if (d == NULL) { +@@ -892,7 +901,14 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + d->offset = 0; + d->entry = NULL; + +- fi->fh = (uintptr_t)d; ++ pthread_mutex_lock(&lo->mutex); ++ fh = lo_add_dirp_mapping(req, d); ++ pthread_mutex_unlock(&lo->mutex); ++ 
if (fh == -1) { ++ goto out_err; ++ } ++ ++ fi->fh = fh; + if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; + } +@@ -903,6 +919,9 @@ out_errno: + error = errno; + out_err: + if (d) { ++ if (d->dp) { ++ closedir(d->dp); ++ } + if (fd != -1) { + close(fd); + } +@@ -920,17 +939,21 @@ static int is_dot_or_dotdot(const char *name) + static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) + { +- struct lo_dirp *d = lo_dirp(fi); +- char *buf; ++ struct lo_dirp *d; ++ char *buf = NULL; + char *p; + size_t rem = size; +- int err; ++ int err = ENOMEM; + + (void)ino; + ++ d = lo_dirp(req, fi); ++ if (!d) { ++ goto error; ++ } ++ + buf = calloc(1, size); + if (!buf) { +- err = ENOMEM; + goto error; + } + p = buf; +@@ -1028,8 +1051,21 @@ static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, + static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) + { +- struct lo_dirp *d = lo_dirp(fi); ++ struct lo_data *lo = lo_data(req); ++ struct lo_dirp *d; ++ + (void)ino; ++ ++ d = lo_dirp(req, fi); ++ if (!d) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ lo_map_remove(&lo->dirp_map, fi->fh); ++ pthread_mutex_unlock(&lo->mutex); ++ + closedir(d->dp); + free(d); + fuse_reply_err(req, 0); +@@ -1081,8 +1117,18 @@ static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) + { + int res; +- int fd = dirfd(lo_dirp(fi)->dp); ++ struct lo_dirp *d; ++ int fd; ++ + (void)ino; ++ ++ d = lo_dirp(req, fi); ++ if (!d) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ fd = dirfd(d->dp); + if (datasync) { + res = fdatasync(fd); + } else { +@@ -1614,6 +1660,8 @@ int main(int argc, char *argv[]) + root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino); + root_elem->inode = &lo.root; + ++ lo_map_init(&lo.dirp_map); ++ + if (fuse_parse_cmdline(&args, &opts) != 0) { + return 1; + } +@@ -1710,6 
+1758,7 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + ++ lo_map_destroy(&lo.dirp_map); + lo_map_destroy(&lo.ino_map); + + if (lo.root.fd >= 0) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-fallback-for-racy-ops.patch b/kvm-virtiofsd-passthrough_ll-add-fallback-for-racy-ops.patch new file mode 100755 index 0000000..b8de3d8 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-fallback-for-racy-ops.patch @@ -0,0 +1,303 @@ +From 03effbc021064bb77d231ae5ca02d1a579c71ee1 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:17 +0100 +Subject: [PATCH 046/116] virtiofsd: passthrough_ll: add fallback for racy ops +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-43-dgilbert@redhat.com> +Patchwork-id: 93496 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 042/112] virtiofsd: passthrough_ll: add fallback for racy ops +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +We have two operations that cannot be done race-free on a symlink in +certain cases: utimes and link. + +Add racy fallback for these if the race-free method doesn't work. We do +our best to avoid races even in this case: + + - get absolute path by reading /proc/self/fd/NN symlink + + - lookup parent directory: after this we are safe against renames in + ancestors + + - lookup name in parent directory, and verify that we got to the original + inode, if not retry the whole thing + +Both utimes(2) and link(2) hold i_lock on the inode across the operation, +so a racing rename/delete by this fuse instance is not possible, only from +other entities changing the filesystem. + +If the "norace" option is given, then disable the racy fallbacks. + +Signed-off-by: Miklos Szeredi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 5fe319a7b19c9c328e6e061bffcf1ff6cc8b89ce) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 5 +- + tools/virtiofsd/passthrough_ll.c | 157 +++++++++++++++++++++++++++++++++++---- + 2 files changed, 145 insertions(+), 17 deletions(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index b8ec5ac..5531425 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -142,7 +142,10 @@ void fuse_cmdline_help(void) + " --daemonize run in background\n" + " -o max_idle_threads the maximum number of idle worker " + "threads\n" +- " allowed (default: 10)\n"); ++ " allowed (default: 10)\n" ++ " -o norace disable racy fallback\n" ++ " default: false\n" ++ ); + } + + static int fuse_helper_opt_proc(void *data, const char *arg, int key, +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9815bfa..ac380ef 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -98,6 +98,7 @@ enum { + struct lo_data { + pthread_mutex_t mutex; + int debug; ++ int norace; + int writeback; + int flock; + int xattr; +@@ -124,10 +125,15 @@ static const struct fuse_opt lo_opts[] = { + { "cache=never", offsetof(struct lo_data, cache), CACHE_NEVER }, + { "cache=auto", offsetof(struct lo_data, cache), CACHE_NORMAL }, + { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, +- ++ { "norace", offsetof(struct lo_data, norace), 1 }, + FUSE_OPT_END + }; + ++static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); ++ ++static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st); ++ ++ + static struct lo_data *lo_data(fuse_req_t req) + { + return (struct lo_data *)fuse_req_userdata(req); +@@ -347,23 +353,127 @@ static void lo_getattr(fuse_req_t req, fuse_ino_t ino, + fuse_reply_attr(req, &buf, lo->timeout); + } + +-static int utimensat_empty_nofollow(struct lo_inode *inode, +- const struct timespec 
*tv) ++static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode, ++ char path[PATH_MAX], struct lo_inode **parent) + { +- int res; + char procname[64]; ++ char *last; ++ struct stat stat; ++ struct lo_inode *p; ++ int retries = 2; ++ int res; ++ ++retry: ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ res = readlink(procname, path, PATH_MAX); ++ if (res < 0) { ++ fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__); ++ goto fail_noretry; ++ } ++ ++ if (res >= PATH_MAX) { ++ fuse_log(FUSE_LOG_WARNING, "%s: readlink overflowed\n", __func__); ++ goto fail_noretry; ++ } ++ path[res] = '\0'; ++ ++ last = strrchr(path, '/'); ++ if (last == NULL) { ++ /* Shouldn't happen */ ++ fuse_log( ++ FUSE_LOG_WARNING, ++ "%s: INTERNAL ERROR: bad path read from proc\n", __func__); ++ goto fail_noretry; ++ } ++ if (last == path) { ++ p = &lo->root; ++ pthread_mutex_lock(&lo->mutex); ++ p->refcount++; ++ pthread_mutex_unlock(&lo->mutex); ++ } else { ++ *last = '\0'; ++ res = fstatat(AT_FDCWD, last == path ? 
"/" : path, &stat, 0); ++ if (res == -1) { ++ if (!retries) { ++ fuse_log(FUSE_LOG_WARNING, ++ "%s: failed to stat parent: %m\n", __func__); ++ } ++ goto fail; ++ } ++ p = lo_find(lo, &stat); ++ if (p == NULL) { ++ if (!retries) { ++ fuse_log(FUSE_LOG_WARNING, ++ "%s: failed to find parent\n", __func__); ++ } ++ goto fail; ++ } ++ } ++ last++; ++ res = fstatat(p->fd, last, &stat, AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ if (!retries) { ++ fuse_log(FUSE_LOG_WARNING, ++ "%s: failed to stat last\n", __func__); ++ } ++ goto fail_unref; ++ } ++ if (stat.st_dev != inode->dev || stat.st_ino != inode->ino) { ++ if (!retries) { ++ fuse_log(FUSE_LOG_WARNING, ++ "%s: failed to match last\n", __func__); ++ } ++ goto fail_unref; ++ } ++ *parent = p; ++ memmove(path, last, strlen(last) + 1); ++ ++ return 0; ++ ++fail_unref: ++ unref_inode(lo, p, 1); ++fail: ++ if (retries) { ++ retries--; ++ goto retry; ++ } ++fail_noretry: ++ errno = EIO; ++ return -1; ++} ++ ++static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode, ++ const struct timespec *tv) ++{ ++ int res; ++ struct lo_inode *parent; ++ char path[PATH_MAX]; + + if (inode->is_symlink) { +- res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH); + if (res == -1 && errno == EINVAL) { + /* Sorry, no race free way to set times on symlink. 
*/ +- errno = EPERM; ++ if (lo->norace) { ++ errno = EPERM; ++ } else { ++ goto fallback; ++ } + } + return res; + } +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(path, "/proc/self/fd/%i", inode->fd); + +- return utimensat(AT_FDCWD, procname, tv, 0); ++ return utimensat(AT_FDCWD, path, tv, 0); ++ ++fallback: ++ res = lo_parent_and_name(lo, inode, path, &parent); ++ if (res != -1) { ++ res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW); ++ unref_inode(lo, parent, 1); ++ } ++ ++ return res; + } + + static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi) +@@ -387,6 +497,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + { + int saverr; + char procname[64]; ++ struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + int ifd; + int res; +@@ -459,7 +570,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + if (fi) { + res = futimens(fd, tv); + } else { +- res = utimensat_empty_nofollow(inode, tv); ++ res = utimensat_empty(lo, inode, tv); + } + if (res == -1) { + goto out_err; +@@ -709,24 +820,38 @@ static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent, + lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); + } + +-static int linkat_empty_nofollow(struct lo_inode *inode, int dfd, +- const char *name) ++static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode, ++ int dfd, const char *name) + { + int res; +- char procname[64]; ++ struct lo_inode *parent; ++ char path[PATH_MAX]; + + if (inode->is_symlink) { + res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); + if (res == -1 && (errno == ENOENT || errno == EINVAL)) { + /* Sorry, no race free way to hard-link a symlink. 
*/ +- errno = EPERM; ++ if (lo->norace) { ++ errno = EPERM; ++ } else { ++ goto fallback; ++ } + } + return res; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(path, "/proc/self/fd/%i", inode->fd); ++ ++ return linkat(AT_FDCWD, path, dfd, name, AT_SYMLINK_FOLLOW); ++ ++fallback: ++ res = lo_parent_and_name(lo, inode, path, &parent); ++ if (res != -1) { ++ res = linkat(parent->fd, path, dfd, name, 0); ++ unref_inode(lo, parent, 1); ++ } + +- return linkat(AT_FDCWD, procname, dfd, name, AT_SYMLINK_FOLLOW); ++ return res; + } + + static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, +@@ -748,7 +873,7 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + e.attr_timeout = lo->timeout; + e.entry_timeout = lo->timeout; + +- res = linkat_empty_nofollow(inode, lo_fd(req, parent), name); ++ res = linkat_empty_nofollow(lo, inode, lo_fd(req, parent), name); + if (res == -1) { + goto out_err; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-fd_map-to-hide-file-des.patch b/kvm-virtiofsd-passthrough_ll-add-fd_map-to-hide-file-des.patch new file mode 100755 index 0000000..24b2a6e --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-fd_map-to-hide-file-des.patch @@ -0,0 +1,328 @@ +From 35337e604e9149d6d8fcf74b8b82ac33a8611ebb Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:16 +0100 +Subject: [PATCH 045/116] virtiofsd: passthrough_ll: add fd_map to hide file + descriptors +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-42-dgilbert@redhat.com> +Patchwork-id: 93494 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 041/112] virtiofsd: passthrough_ll: add fd_map to hide file descriptors +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Do not expose file descriptor numbers to clients. This prevents the +abuse of internal file descriptors (like stdin/stdout). + +Signed-off-by: Stefan Hajnoczi +Fix from: +Signed-off-by: Xiao Yang +dgilbert: + Added lseek +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 73b4d19dfc4248a74c1f3e511cfa934681d9c602) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 116 +++++++++++++++++++++++++++++++-------- + 1 file changed, 94 insertions(+), 22 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 5f5a72f..9815bfa 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -60,6 +60,7 @@ struct lo_map_elem { + union { + struct lo_inode *inode; + struct lo_dirp *dirp; ++ int fd; + ssize_t freelist; + }; + bool in_use; +@@ -107,6 +108,7 @@ struct lo_data { + struct lo_inode root; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ ++ struct lo_map fd_map; /* protected by lo->mutex */ + }; + + static const struct fuse_opt lo_opts[] = { +@@ -237,6 +239,20 @@ static void lo_map_remove(struct lo_map *map, size_t key) + } + + /* Assumes lo->mutex is held */ ++static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd) ++{ ++ struct lo_map_elem *elem; ++ ++ elem = lo_map_alloc_elem(&lo_data(req)->fd_map); ++ if (!elem) { ++ return -1; ++ } ++ ++ elem->fd = fd; ++ return elem - lo_data(req)->fd_map.elems; ++} ++ ++/* Assumes lo->mutex is held */ + static ssize_t 
lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp) + { + struct lo_map_elem *elem; +@@ -350,6 +366,22 @@ static int utimensat_empty_nofollow(struct lo_inode *inode, + return utimensat(AT_FDCWD, procname, tv, 0); + } + ++static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi) ++{ ++ struct lo_data *lo = lo_data(req); ++ struct lo_map_elem *elem; ++ ++ pthread_mutex_lock(&lo->mutex); ++ elem = lo_map_get(&lo->fd_map, fi->fh); ++ pthread_mutex_unlock(&lo->mutex); ++ ++ if (!elem) { ++ return -1; ++ } ++ ++ return elem->fd; ++} ++ + static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + int valid, struct fuse_file_info *fi) + { +@@ -358,6 +390,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + struct lo_inode *inode; + int ifd; + int res; ++ int fd; + + inode = lo_inode(req, ino); + if (!inode) { +@@ -367,9 +400,14 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + + ifd = inode->fd; + ++ /* If fi->fh is invalid we'll report EBADF later */ ++ if (fi) { ++ fd = lo_fi_fd(req, fi); ++ } ++ + if (valid & FUSE_SET_ATTR_MODE) { + if (fi) { +- res = fchmod(fi->fh, attr->st_mode); ++ res = fchmod(fd, attr->st_mode); + } else { + sprintf(procname, "/proc/self/fd/%i", ifd); + res = chmod(procname, attr->st_mode); +@@ -389,7 +427,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + } + if (valid & FUSE_SET_ATTR_SIZE) { + if (fi) { +- res = ftruncate(fi->fh, attr->st_size); ++ res = ftruncate(fd, attr->st_size); + } else { + sprintf(procname, "/proc/self/fd/%i", ifd); + res = truncate(procname, attr->st_size); +@@ -419,7 +457,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + } + + if (fi) { +- res = futimens(fi->fh, tv); ++ res = futimens(fd, tv); + } else { + res = utimensat_empty_nofollow(inode, tv); + } +@@ -1096,7 +1134,18 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + lo_restore_cred(&old); + + 
if (!err) { +- fi->fh = fd; ++ ssize_t fh; ++ ++ pthread_mutex_lock(&lo->mutex); ++ fh = lo_add_fd_mapping(req, fd); ++ pthread_mutex_unlock(&lo->mutex); ++ if (fh == -1) { ++ close(fd); ++ fuse_reply_err(req, ENOMEM); ++ return; ++ } ++ ++ fi->fh = fh; + err = lo_do_lookup(req, parent, name, &e); + } + if (lo->cache == CACHE_NEVER) { +@@ -1140,6 +1189,7 @@ static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { + int fd; ++ ssize_t fh; + char buf[64]; + struct lo_data *lo = lo_data(req); + +@@ -1175,7 +1225,16 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + return (void)fuse_reply_err(req, errno); + } + +- fi->fh = fd; ++ pthread_mutex_lock(&lo->mutex); ++ fh = lo_add_fd_mapping(req, fd); ++ pthread_mutex_unlock(&lo->mutex); ++ if (fh == -1) { ++ close(fd); ++ fuse_reply_err(req, ENOMEM); ++ return; ++ } ++ ++ fi->fh = fh; + if (lo->cache == CACHE_NEVER) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { +@@ -1187,9 +1246,18 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + static void lo_release(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) + { ++ struct lo_data *lo = lo_data(req); ++ int fd; ++ + (void)ino; + +- close(fi->fh); ++ fd = lo_fi_fd(req, fi); ++ ++ pthread_mutex_lock(&lo->mutex); ++ lo_map_remove(&lo->fd_map, fi->fh); ++ pthread_mutex_unlock(&lo->mutex); ++ ++ close(fd); + fuse_reply_err(req, 0); + } + +@@ -1197,7 +1265,7 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { + int res; + (void)ino; +- res = close(dup(fi->fh)); ++ res = close(dup(lo_fi_fd(req, fi))); + fuse_reply_err(req, res == -1 ? 
errno : 0); + } + +@@ -1224,7 +1292,7 @@ static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + return (void)fuse_reply_err(req, errno); + } + } else { +- fd = fi->fh; ++ fd = lo_fi_fd(req, fi); + } + + if (datasync) { +@@ -1251,7 +1319,7 @@ static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, + } + + buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; +- buf.buf[0].fd = fi->fh; ++ buf.buf[0].fd = lo_fi_fd(req, fi); + buf.buf[0].pos = offset; + + fuse_reply_data(req, &buf); +@@ -1266,7 +1334,7 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); + + out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; +- out_buf.buf[0].fd = fi->fh; ++ out_buf.buf[0].fd = lo_fi_fd(req, fi); + out_buf.buf[0].pos = off; + + if (lo_debug(req)) { +@@ -1303,7 +1371,7 @@ static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + (void)ino; + + #ifdef CONFIG_FALLOCATE +- err = fallocate(fi->fh, mode, offset, length); ++ err = fallocate(lo_fi_fd(req, fi), mode, offset, length); + if (err < 0) { + err = errno; + } +@@ -1314,7 +1382,7 @@ static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + return; + } + +- err = posix_fallocate(fi->fh, offset, length); ++ err = posix_fallocate(lo_fi_fd(req, fi), offset, length); + #endif + + fuse_reply_err(req, err); +@@ -1326,7 +1394,7 @@ static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + int res; + (void)ino; + +- res = flock(fi->fh, op); ++ res = flock(lo_fi_fd(req, fi), op); + + fuse_reply_err(req, res == -1 ? 
errno : 0); + } +@@ -1551,17 +1619,19 @@ static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, + off_t off_out, struct fuse_file_info *fi_out, + size_t len, int flags) + { ++ int in_fd, out_fd; + ssize_t res; + +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, +- "lo_copy_file_range(ino=%" PRIu64 "/fd=%lu, " +- "off=%lu, ino=%" PRIu64 "/fd=%lu, " +- "off=%lu, size=%zd, flags=0x%x)\n", +- ino_in, fi_in->fh, off_in, ino_out, fi_out->fh, off_out, len, +- flags); ++ in_fd = lo_fi_fd(req, fi_in); ++ out_fd = lo_fi_fd(req, fi_out); ++ ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, " ++ "off=%lu, ino=%" PRIu64 "/fd=%d, " ++ "off=%lu, size=%zd, flags=0x%x)\n", ++ ino_in, in_fd, off_in, ino_out, out_fd, off_out, len, flags); + +- res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out, len, flags); ++ res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags); + if (res < 0) { + fuse_reply_err(req, -errno); + } else { +@@ -1576,7 +1646,7 @@ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, + off_t res; + + (void)ino; +- res = lseek(fi->fh, off, whence); ++ res = lseek(lo_fi_fd(req, fi), off, whence); + if (res != -1) { + fuse_reply_lseek(req, res); + } else { +@@ -1661,6 +1731,7 @@ int main(int argc, char *argv[]) + root_elem->inode = &lo.root; + + lo_map_init(&lo.dirp_map); ++ lo_map_init(&lo.fd_map); + + if (fuse_parse_cmdline(&args, &opts) != 0) { + return 1; +@@ -1758,6 +1829,7 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + ++ lo_map_destroy(&lo.fd_map); + lo_map_destroy(&lo.dirp_map); + lo_map_destroy(&lo.ino_map); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-ino_map-to-hide-lo_inod.patch b/kvm-virtiofsd-passthrough_ll-add-ino_map-to-hide-lo_inod.patch new file mode 100755 index 0000000..ba8b730 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-ino_map-to-hide-lo_inod.patch @@ -0,0 +1,395 @@ +From d81396cc3d9815730903b0755c9d2e67d6954d54 Mon Sep 
17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:14 +0100 +Subject: [PATCH 043/116] virtiofsd: passthrough_ll: add ino_map to hide + lo_inode pointers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-40-dgilbert@redhat.com> +Patchwork-id: 93493 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 039/112] virtiofsd: passthrough_ll: add ino_map to hide lo_inode pointers +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Do not expose lo_inode pointers to clients. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 92fb57b83cdbfc4bf53c0c46a3d0bcbc36e64126) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 144 +++++++++++++++++++++++++++++++-------- + 1 file changed, 114 insertions(+), 30 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e83a976..a3ebf74 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -57,8 +57,8 @@ + #include "passthrough_helpers.h" + + /* +- * We are re-using pointers to our `struct lo_inode` and `struct +- * lo_dirp` elements as inodes. This means that we must be able to ++ * We are re-using pointers to our `struct lo_inode` ++ * elements as inodes. This means that we must be able to + * store uintptr_t values in a fuse_ino_t variable. The following + * incantation checks this condition at compile time. + */ +@@ -76,7 +76,7 @@ struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct { + + struct lo_map_elem { + union { +- /* Element values will go here... 
*/ ++ struct lo_inode *inode; + ssize_t freelist; + }; + bool in_use; +@@ -97,6 +97,7 @@ struct lo_inode { + ino_t ino; + dev_t dev; + uint64_t refcount; /* protected by lo->mutex */ ++ fuse_ino_t fuse_ino; + }; + + struct lo_cred { +@@ -121,6 +122,7 @@ struct lo_data { + int cache; + int timeout_set; + struct lo_inode root; /* protected by lo->mutex */ ++ struct lo_map ino_map; /* protected by lo->mutex */ + }; + + static const struct fuse_opt lo_opts[] = { +@@ -145,14 +147,14 @@ static struct lo_data *lo_data(fuse_req_t req) + return (struct lo_data *)fuse_req_userdata(req); + } + +-__attribute__((unused)) static void lo_map_init(struct lo_map *map) ++static void lo_map_init(struct lo_map *map) + { + map->elems = NULL; + map->nelems = 0; + map->freelist = -1; + } + +-__attribute__((unused)) static void lo_map_destroy(struct lo_map *map) ++static void lo_map_destroy(struct lo_map *map) + { + free(map->elems); + } +@@ -183,8 +185,7 @@ static int lo_map_grow(struct lo_map *map, size_t new_nelems) + return 1; + } + +-__attribute__((unused)) static struct lo_map_elem * +-lo_map_alloc_elem(struct lo_map *map) ++static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map) + { + struct lo_map_elem *elem; + +@@ -200,8 +201,7 @@ lo_map_alloc_elem(struct lo_map *map) + return elem; + } + +-__attribute__((unused)) static struct lo_map_elem * +-lo_map_reserve(struct lo_map *map, size_t key) ++static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key) + { + ssize_t *prev; + +@@ -222,8 +222,7 @@ lo_map_reserve(struct lo_map *map, size_t key) + return NULL; + } + +-__attribute__((unused)) static struct lo_map_elem * +-lo_map_get(struct lo_map *map, size_t key) ++static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key) + { + if (key >= map->nelems) { + return NULL; +@@ -234,8 +233,7 @@ lo_map_get(struct lo_map *map, size_t key) + return &map->elems[key]; + } + +-__attribute__((unused)) static void lo_map_remove(struct lo_map *map, +- size_t key) 
++static void lo_map_remove(struct lo_map *map, size_t key) + { + struct lo_map_elem *elem; + +@@ -254,18 +252,40 @@ __attribute__((unused)) static void lo_map_remove(struct lo_map *map, + map->freelist = key; + } + ++/* Assumes lo->mutex is held */ ++static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) ++{ ++ struct lo_map_elem *elem; ++ ++ elem = lo_map_alloc_elem(&lo_data(req)->ino_map); ++ if (!elem) { ++ return -1; ++ } ++ ++ elem->inode = inode; ++ return elem - lo_data(req)->ino_map.elems; ++} ++ + static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + { +- if (ino == FUSE_ROOT_ID) { +- return &lo_data(req)->root; +- } else { +- return (struct lo_inode *)(uintptr_t)ino; ++ struct lo_data *lo = lo_data(req); ++ struct lo_map_elem *elem; ++ ++ pthread_mutex_lock(&lo->mutex); ++ elem = lo_map_get(&lo->ino_map, ino); ++ pthread_mutex_unlock(&lo->mutex); ++ ++ if (!elem) { ++ return NULL; + } ++ ++ return elem->inode; + } + + static int lo_fd(fuse_req_t req, fuse_ino_t ino) + { +- return lo_inode(req, ino)->fd; ++ struct lo_inode *inode = lo_inode(req, ino); ++ return inode ? 
inode->fd : -1; + } + + static bool lo_debug(fuse_req_t req) +@@ -337,10 +357,18 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + { + int saverr; + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- int ifd = inode->fd; ++ struct lo_inode *inode; ++ int ifd; + int res; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ ifd = inode->fd; ++ + if (valid & FUSE_SET_ATTR_MODE) { + if (fi) { + res = fchmod(fi->fh, attr->st_mode); +@@ -470,6 +498,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + inode->dev = e->attr.st_dev; + + pthread_mutex_lock(&lo->mutex); ++ inode->fuse_ino = lo_add_inode_mapping(req, inode); + prev = &lo->root; + next = prev->next; + next->prev = inode; +@@ -478,7 +507,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + prev->next = inode; + pthread_mutex_unlock(&lo->mutex); + } +- e->ino = (uintptr_t)inode; ++ e->ino = inode->fuse_ino; + + if (lo_debug(req)) { + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +@@ -582,10 +611,16 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + { + int res; + int saverr; +- struct lo_inode *dir = lo_inode(req, parent); ++ struct lo_inode *dir; + struct fuse_entry_param e; + struct lo_cred old = {}; + ++ dir = lo_inode(req, parent); ++ if (!dir) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOMEM; + + saverr = lo_change_cred(req, &old); +@@ -663,10 +698,16 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + { + int res; + struct lo_data *lo = lo_data(req); +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + struct fuse_entry_param e; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + memset(&e, 0, sizeof(struct fuse_entry_param)); + e.attr_timeout = lo->timeout; + e.entry_timeout = lo->timeout; +@@ -684,7 
+725,7 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + pthread_mutex_lock(&lo->mutex); + inode->refcount++; + pthread_mutex_unlock(&lo->mutex); +- e.ino = (uintptr_t)inode; ++ e.ino = inode->fuse_ino; + + if (lo_debug(req)) { + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +@@ -750,10 +791,10 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) + next->prev = prev; + prev->next = next; + ++ lo_map_remove(&lo->ino_map, inode->fuse_ino); + pthread_mutex_unlock(&lo->mutex); + close(inode->fd); + free(inode); +- + } else { + pthread_mutex_unlock(&lo->mutex); + } +@@ -762,7 +803,12 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { + struct lo_data *lo = lo_data(req); +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; ++ ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ return; ++ } + + if (lo_debug(req)) { + fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", +@@ -1244,10 +1290,16 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + { + char *value = NULL; + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + ssize_t ret; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; +@@ -1306,10 +1358,16 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + { + char *value = NULL; + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + ssize_t ret; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; +@@ -1367,10 +1425,16 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + const char 
*value, size_t size, int flags) + { + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + ssize_t ret; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; +@@ -1400,10 +1464,16 @@ out: + static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + { + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + ssize_t ret; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; +@@ -1522,6 +1592,7 @@ int main(int argc, char *argv[]) + struct fuse_session *se; + struct fuse_cmdline_opts opts; + struct lo_data lo = { .debug = 0, .writeback = 0 }; ++ struct lo_map_elem *root_elem; + int ret = -1; + + /* Don't mask creation mode, kernel already did that */ +@@ -1530,8 +1601,19 @@ int main(int argc, char *argv[]) + pthread_mutex_init(&lo.mutex, NULL); + lo.root.next = lo.root.prev = &lo.root; + lo.root.fd = -1; ++ lo.root.fuse_ino = FUSE_ROOT_ID; + lo.cache = CACHE_NORMAL; + ++ /* ++ * Set up the ino map like this: ++ * [0] Reserved (will not be used) ++ * [1] Root inode ++ */ ++ lo_map_init(&lo.ino_map); ++ lo_map_reserve(&lo.ino_map, 0)->in_use = false; ++ root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino); ++ root_elem->inode = &lo.root; ++ + if (fuse_parse_cmdline(&args, &opts) != 0) { + return 1; + } +@@ -1628,6 +1710,8 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + ++ lo_map_destroy(&lo.ino_map); ++ + if (lo.root.fd >= 0) { + close(lo.root.fd); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-lo_map-for-ino-fh-indir.patch b/kvm-virtiofsd-passthrough_ll-add-lo_map-for-ino-fh-indir.patch new file mode 100755 index 0000000..4751f95 --- /dev/null +++ 
b/kvm-virtiofsd-passthrough_ll-add-lo_map-for-ino-fh-indir.patch @@ -0,0 +1,182 @@ +From d56651e227bae83ee0cceb12bd91e3e9f6045ab3 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:13 +0100 +Subject: [PATCH 042/116] virtiofsd: passthrough_ll: add lo_map for ino/fh + indirection +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-39-dgilbert@redhat.com> +Patchwork-id: 93492 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 038/112] virtiofsd: passthrough_ll: add lo_map for ino/fh indirection +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +A layer of indirection is needed because passthrough_ll cannot expose +pointers or file descriptor numbers to untrusted clients. Malicious +clients could send invalid pointers or file descriptors in order to +crash or exploit the file system daemon. + +lo_map provides an integer key->value mapping. This will be used for +ino and fh fields in the patches that follow. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 25c135727b08dca90f00094e522a69170b13dfac) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 124 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 124 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 5e06179..e83a976 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -74,6 +74,21 @@ struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct { + }; + #endif + ++struct lo_map_elem { ++ union { ++ /* Element values will go here... 
*/ ++ ssize_t freelist; ++ }; ++ bool in_use; ++}; ++ ++/* Maps FUSE fh or ino values to internal objects */ ++struct lo_map { ++ struct lo_map_elem *elems; ++ size_t nelems; ++ ssize_t freelist; ++}; ++ + struct lo_inode { + struct lo_inode *next; /* protected by lo->mutex */ + struct lo_inode *prev; /* protected by lo->mutex */ +@@ -130,6 +145,115 @@ static struct lo_data *lo_data(fuse_req_t req) + return (struct lo_data *)fuse_req_userdata(req); + } + ++__attribute__((unused)) static void lo_map_init(struct lo_map *map) ++{ ++ map->elems = NULL; ++ map->nelems = 0; ++ map->freelist = -1; ++} ++ ++__attribute__((unused)) static void lo_map_destroy(struct lo_map *map) ++{ ++ free(map->elems); ++} ++ ++static int lo_map_grow(struct lo_map *map, size_t new_nelems) ++{ ++ struct lo_map_elem *new_elems; ++ size_t i; ++ ++ if (new_nelems <= map->nelems) { ++ return 1; ++ } ++ ++ new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems); ++ if (!new_elems) { ++ return 0; ++ } ++ ++ for (i = map->nelems; i < new_nelems; i++) { ++ new_elems[i].freelist = i + 1; ++ new_elems[i].in_use = false; ++ } ++ new_elems[new_nelems - 1].freelist = -1; ++ ++ map->elems = new_elems; ++ map->freelist = map->nelems; ++ map->nelems = new_nelems; ++ return 1; ++} ++ ++__attribute__((unused)) static struct lo_map_elem * ++lo_map_alloc_elem(struct lo_map *map) ++{ ++ struct lo_map_elem *elem; ++ ++ if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) { ++ return NULL; ++ } ++ ++ elem = &map->elems[map->freelist]; ++ map->freelist = elem->freelist; ++ ++ elem->in_use = true; ++ ++ return elem; ++} ++ ++__attribute__((unused)) static struct lo_map_elem * ++lo_map_reserve(struct lo_map *map, size_t key) ++{ ++ ssize_t *prev; ++ ++ if (!lo_map_grow(map, key + 1)) { ++ return NULL; ++ } ++ ++ for (prev = &map->freelist; *prev != -1; ++ prev = &map->elems[*prev].freelist) { ++ if (*prev == key) { ++ struct lo_map_elem *elem = &map->elems[key]; ++ ++ *prev = elem->freelist; 
++ elem->in_use = true; ++ return elem; ++ } ++ } ++ return NULL; ++} ++ ++__attribute__((unused)) static struct lo_map_elem * ++lo_map_get(struct lo_map *map, size_t key) ++{ ++ if (key >= map->nelems) { ++ return NULL; ++ } ++ if (!map->elems[key].in_use) { ++ return NULL; ++ } ++ return &map->elems[key]; ++} ++ ++__attribute__((unused)) static void lo_map_remove(struct lo_map *map, ++ size_t key) ++{ ++ struct lo_map_elem *elem; ++ ++ if (key >= map->nelems) { ++ return; ++ } ++ ++ elem = &map->elems[key]; ++ if (!elem->in_use) { ++ return; ++ } ++ ++ elem->in_use = false; ++ ++ elem->freelist = map->freelist; ++ map->freelist = key; ++} ++ + static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + { + if (ino == FUSE_ROOT_ID) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-renameat2-support.patch b/kvm-virtiofsd-passthrough_ll-add-renameat2-support.patch new file mode 100755 index 0000000..a3f7970 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-renameat2-support.patch @@ -0,0 +1,52 @@ +From 86b4f2865f2ebd7e6b3d85beb66a9390eb46eb96 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:45 +0100 +Subject: [PATCH 074/116] virtiofsd: passthrough_ll: add renameat2 support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-71-dgilbert@redhat.com> +Patchwork-id: 93531 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 070/112] virtiofsd: passthrough_ll: add renameat2 support +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Signed-off-by: Miklos Szeredi +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit f0ab7d6f78a7d3c1c19fd81a91c9b1199f56c4f6) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 98114a3..18d69ab 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1099,7 +1099,17 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + if (flags) { ++#ifndef SYS_renameat2 + fuse_reply_err(req, EINVAL); ++#else ++ res = syscall(SYS_renameat2, lo_fd(req, parent), name, ++ lo_fd(req, newparent), newname, flags); ++ if (res == -1 && errno == ENOSYS) { ++ fuse_reply_err(req, EINVAL); ++ } else { ++ fuse_reply_err(req, res == -1 ? errno : 0); ++ } ++#endif + return; + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-clean-up-cache-related-opti.patch b/kvm-virtiofsd-passthrough_ll-clean-up-cache-related-opti.patch new file mode 100755 index 0000000..dc87ef2 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-clean-up-cache-related-opti.patch @@ -0,0 +1,138 @@ +From 079199c53f483f0051f994b195ebb595aec76a39 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:51 +0100 +Subject: [PATCH 080/116] virtiofsd: passthrough_ll: clean up cache related + options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-77-dgilbert@redhat.com> +Patchwork-id: 93530 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 076/112] virtiofsd: passthrough_ll: clean up cache related options +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + + - Rename "cache=never" to "cache=none" to match 9p's similar option. 
+ + - Rename CACHE_NORMAL constant to CACHE_AUTO to match the "cache=auto" + option. + +Signed-off-by: Miklos Szeredi +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 230e777b5e250759ee0480fcc0e9ccfa2b082fba) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 5 ++++- + tools/virtiofsd/passthrough_ll.c | 20 ++++++++++---------- + 2 files changed, 14 insertions(+), 11 deletions(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 14f5d70..5672024 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -145,6 +145,9 @@ void fuse_cmdline_help(void) + " --syslog log to syslog (default stderr)\n" + " -f foreground operation\n" + " --daemonize run in background\n" ++ " -o cache= cache mode. could be one of \"auto, " ++ "always, none\"\n" ++ " default: auto\n" + " -o log_level= log level, default to \"info\"\n" + " level could be one of \"debug, " + "info, warn, err\"\n" +@@ -156,7 +159,7 @@ void fuse_cmdline_help(void) + " -o readdirplus|no_readdirplus\n" + " enable/disable readirplus\n" + " default: readdirplus except with " +- "cache=never\n" ++ "cache=none\n" + ); + } + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9e7191e..b40f287 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -101,8 +101,8 @@ struct lo_cred { + }; + + enum { +- CACHE_NEVER, +- CACHE_NORMAL, ++ CACHE_NONE, ++ CACHE_AUTO, + CACHE_ALWAYS, + }; + +@@ -138,8 +138,8 @@ static const struct fuse_opt lo_opts[] = { + { "no_xattr", offsetof(struct lo_data, xattr), 0 }, + { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, + { "timeout=", offsetof(struct lo_data, timeout_set), 1 }, +- { "cache=never", offsetof(struct lo_data, cache), CACHE_NEVER }, +- { "cache=auto", offsetof(struct lo_data, cache), CACHE_NORMAL }, ++ { "cache=none", offsetof(struct lo_data, cache), 
CACHE_NONE }, ++ { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO }, + { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, + { "norace", offsetof(struct lo_data, norace), 1 }, + { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, +@@ -482,7 +482,7 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } +- if ((lo->cache == CACHE_NEVER && !lo->readdirplus_set) || ++ if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) || + lo->readdirplus_clear) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); + conn->want &= ~FUSE_CAP_READDIRPLUS; +@@ -1493,7 +1493,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + fi->fh = fh; + err = lo_do_lookup(req, parent, name, &e); + } +- if (lo->cache == CACHE_NEVER) { ++ if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; +@@ -1578,7 +1578,7 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + } + + fi->fh = fh; +- if (lo->cache == CACHE_NEVER) { ++ if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; +@@ -2395,7 +2395,7 @@ int main(int argc, char *argv[]) + lo.root.next = lo.root.prev = &lo.root; + lo.root.fd = -1; + lo.root.fuse_ino = FUSE_ROOT_ID; +- lo.cache = CACHE_NORMAL; ++ lo.cache = CACHE_AUTO; + + /* + * Set up the ino map like this: +@@ -2470,11 +2470,11 @@ int main(int argc, char *argv[]) + } + if (!lo.timeout_set) { + switch (lo.cache) { +- case CACHE_NEVER: ++ case CACHE_NONE: + lo.timeout = 0.0; + break; + +- case CACHE_NORMAL: ++ case CACHE_AUTO: + lo.timeout = 1.0; + break; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-cleanup-getxattr-listxattr.patch b/kvm-virtiofsd-passthrough_ll-cleanup-getxattr-listxattr.patch new file mode 100755 index 
0000000..c55eead --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-cleanup-getxattr-listxattr.patch @@ -0,0 +1,154 @@ +From f93ea308351cbe2630d7ecf637c3b69894d84a11 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:13 +0000 +Subject: [PATCH 17/18] virtiofsd: passthrough_ll: cleanup getxattr/listxattr +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-7-dgilbert@redhat.com> +Patchwork-id: 94125 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 6/7] virtiofsd: passthrough_ll: cleanup getxattr/listxattr +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: Misono Tomohiro + +This is a cleanup patch to simplify the following xattr fix and +there is no functional changes. + +- Move memory allocation to head of the function +- Unify fgetxattr/flistxattr call for both size == 0 and + size != 0 case +- Remove redundant lo_inode_put call in error path + (Note: second call is ignored now since @inode is already NULL) + +Signed-off-by: Misono Tomohiro +Message-Id: <20200227055927.24566-2-misono.tomohiro@jp.fujitsu.com> +Acked-by: Vivek Goyal +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 16e15a73089102c3d8846792d514e769300fcc3c) +Signed-off-by: Danilo C. L. 
de Paula +--- + tools/virtiofsd/passthrough_ll.c | 54 ++++++++++++++++------------------------ + 1 file changed, 22 insertions(+), 32 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index c635fc8..50c7273 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2199,34 +2199,30 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + ++ if (size) { ++ value = malloc(size); ++ if (!value) { ++ goto out_err; ++ } ++ } ++ + sprintf(procname, "%i", inode->fd); + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + goto out_err; + } + ++ ret = fgetxattr(fd, name, value, size); ++ if (ret == -1) { ++ goto out_err; ++ } + if (size) { +- value = malloc(size); +- if (!value) { +- goto out_err; +- } +- +- ret = fgetxattr(fd, name, value, size); +- if (ret == -1) { +- goto out_err; +- } + saverr = 0; + if (ret == 0) { + goto out; + } +- + fuse_reply_buf(req, value, ret); + } else { +- ret = fgetxattr(fd, name, NULL, 0); +- if (ret == -1) { +- goto out_err; +- } +- + fuse_reply_xattr(req, ret); + } + out_free: +@@ -2242,7 +2238,6 @@ out_free: + out_err: + saverr = errno; + out: +- lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; + } +@@ -2277,34 +2272,30 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + goto out; + } + ++ if (size) { ++ value = malloc(size); ++ if (!value) { ++ goto out_err; ++ } ++ } ++ + sprintf(procname, "%i", inode->fd); + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + goto out_err; + } + ++ ret = flistxattr(fd, value, size); ++ if (ret == -1) { ++ goto out_err; ++ } + if (size) { +- value = malloc(size); +- if (!value) { +- goto out_err; +- } +- +- ret = flistxattr(fd, value, size); +- if (ret == -1) { +- goto out_err; +- } + saverr = 0; + if (ret == 0) { + goto out; + } +- + fuse_reply_buf(req, value, ret); + } else { +- ret = flistxattr(fd, NULL, 
0); +- if (ret == -1) { +- goto out_err; +- } +- + fuse_reply_xattr(req, ret); + } + out_free: +@@ -2320,7 +2311,6 @@ out_free: + out_err: + saverr = errno; + out: +- lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-control-readdirplus.patch b/kvm-virtiofsd-passthrough_ll-control-readdirplus.patch new file mode 100755 index 0000000..98d00fc --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-control-readdirplus.patch @@ -0,0 +1,79 @@ +From 0f1d456fad4ba6a696eff8976b9fe8a0f251e1b5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:47 +0100 +Subject: [PATCH 076/116] virtiofsd: passthrough_ll: control readdirplus +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-73-dgilbert@redhat.com> +Patchwork-id: 93524 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 072/112] virtiofsd: passthrough_ll: control readdirplus +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Signed-off-by: Miklos Szeredi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 59aef494be2d8d91055ff3f3a8eb13d9f32873d8) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 4 ++++ + tools/virtiofsd/passthrough_ll.c | 7 ++++++- + 2 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 6d50a46..14f5d70 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -153,6 +153,10 @@ void fuse_cmdline_help(void) + " allowed (default: 10)\n" + " -o norace disable racy fallback\n" + " default: false\n" ++ " -o readdirplus|no_readdirplus\n" ++ " enable/disable readirplus\n" ++ " default: readdirplus except with " ++ "cache=never\n" + ); + } + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 6480c51..8b1784f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -117,6 +117,8 @@ struct lo_data { + double timeout; + int cache; + int timeout_set; ++ int readdirplus_set; ++ int readdirplus_clear; + struct lo_inode root; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ +@@ -140,6 +142,8 @@ static const struct fuse_opt lo_opts[] = { + { "cache=auto", offsetof(struct lo_data, cache), CACHE_NORMAL }, + { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, + { "norace", offsetof(struct lo_data, norace), 1 }, ++ { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, ++ { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 }, + FUSE_OPT_END + }; + static bool use_syslog = false; +@@ -478,7 +482,8 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } +- if (lo->cache == CACHE_NEVER) { ++ if ((lo->cache == CACHE_NEVER && !lo->readdirplus_set) || ++ lo->readdirplus_clear) { + fuse_log(FUSE_LOG_DEBUG, 
"lo_init: disabling readdirplus\n"); + conn->want &= ~FUSE_CAP_READDIRPLUS; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-create-new-files-in-caller-.patch b/kvm-virtiofsd-passthrough_ll-create-new-files-in-caller-.patch new file mode 100755 index 0000000..4b02779 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-create-new-files-in-caller-.patch @@ -0,0 +1,198 @@ +From af14ef1dba9356e566c9c7531b8fd23361c2b16d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:12 +0100 +Subject: [PATCH 041/116] virtiofsd: passthrough_ll: create new files in + caller's context +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-38-dgilbert@redhat.com> +Patchwork-id: 93488 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 037/112] virtiofsd: passthrough_ll: create new files in caller's context +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +We need to create files in the caller's context. Otherwise after +creating a file, the caller might not be able to do file operations on +that file. + +Changed effective uid/gid to caller's uid/gid, create file and then +switch back to uid/gid 0. + +Use syscall(setresuid, ...) otherwise glibc does some magic to change EUID +in all threads, which is not what we want. + +Signed-off-by: Vivek Goyal +Signed-off-by: Miklos Szeredi +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 929cfb7a9a1b101cdfc9ac19807ecab4c81a13e4) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 96 +++++++++++++++++++++++++++++++++++++--- + 1 file changed, 91 insertions(+), 5 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index cd27c09..5e06179 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -83,6 +84,11 @@ struct lo_inode { + uint64_t refcount; /* protected by lo->mutex */ + }; + ++struct lo_cred { ++ uid_t euid; ++ gid_t egid; ++}; ++ + enum { + CACHE_NEVER, + CACHE_NORMAL, +@@ -383,6 +389,69 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + } + } + ++/* ++ * On some archs, setres*id is limited to 2^16 but they ++ * provide setres*id32 variants that allow 2^32. ++ * Others just let setres*id do 2^32 anyway. ++ */ ++#ifdef SYS_setresgid32 ++#define OURSYS_setresgid SYS_setresgid32 ++#else ++#define OURSYS_setresgid SYS_setresgid ++#endif ++ ++#ifdef SYS_setresuid32 ++#define OURSYS_setresuid SYS_setresuid32 ++#else ++#define OURSYS_setresuid SYS_setresuid ++#endif ++ ++/* ++ * Change to uid/gid of caller so that file is created with ++ * ownership of caller. ++ * TODO: What about selinux context? 
++ */ ++static int lo_change_cred(fuse_req_t req, struct lo_cred *old) ++{ ++ int res; ++ ++ old->euid = geteuid(); ++ old->egid = getegid(); ++ ++ res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1); ++ if (res == -1) { ++ return errno; ++ } ++ ++ res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1); ++ if (res == -1) { ++ int errno_save = errno; ++ ++ syscall(OURSYS_setresgid, -1, old->egid, -1); ++ return errno_save; ++ } ++ ++ return 0; ++} ++ ++/* Regain Privileges */ ++static void lo_restore_cred(struct lo_cred *old) ++{ ++ int res; ++ ++ res = syscall(OURSYS_setresuid, -1, old->euid, -1); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid); ++ exit(1); ++ } ++ ++ res = syscall(OURSYS_setresgid, -1, old->egid, -1); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid); ++ exit(1); ++ } ++} ++ + static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + const char *name, mode_t mode, dev_t rdev, + const char *link) +@@ -391,12 +460,21 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + int saverr; + struct lo_inode *dir = lo_inode(req, parent); + struct fuse_entry_param e; ++ struct lo_cred old = {}; + + saverr = ENOMEM; + ++ saverr = lo_change_cred(req, &old); ++ if (saverr) { ++ goto out; ++ } ++ + res = mknod_wrapper(dir->fd, name, link, mode, rdev); + + saverr = errno; ++ ++ lo_restore_cred(&old); ++ + if (res == -1) { + goto out; + } +@@ -794,26 +872,34 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + struct lo_data *lo = lo_data(req); + struct fuse_entry_param e; + int err; ++ struct lo_cred old = {}; + + if (lo_debug(req)) { + fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", + parent, name); + } + ++ err = lo_change_cred(req, &old); ++ if (err) { ++ goto out; ++ } ++ + fd = openat(lo_fd(req, parent), name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, + mode); +- if (fd == -1) { +- return (void)fuse_reply_err(req, 
errno); +- } ++ err = fd == -1 ? errno : 0; ++ lo_restore_cred(&old); + +- fi->fh = fd; ++ if (!err) { ++ fi->fh = fd; ++ err = lo_do_lookup(req, parent, name, &e); ++ } + if (lo->cache == CACHE_NEVER) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; + } + +- err = lo_do_lookup(req, parent, name, &e); ++out: + if (err) { + fuse_reply_err(req, err); + } else { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-disable-readdirplus-on-cach.patch b/kvm-virtiofsd-passthrough_ll-disable-readdirplus-on-cach.patch new file mode 100755 index 0000000..4a531a3 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-disable-readdirplus-on-cach.patch @@ -0,0 +1,50 @@ +From bbf92338e5e5eed796d511d2bd3c3686b7d1e5fd Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:46 +0100 +Subject: [PATCH 075/116] virtiofsd: passthrough_ll: disable readdirplus on + cache=never +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-72-dgilbert@redhat.com> +Patchwork-id: 93525 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 071/112] virtiofsd: passthrough_ll: disable readdirplus on cache=never +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +...because the attributes sent in the READDIRPLUS reply would be discarded +anyway. + +Signed-off-by: Miklos Szeredi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit ddcbabcb0ea177be3ec3500726b699c7c26ffd93) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 18d69ab..6480c51 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -478,6 +478,10 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } ++ if (lo->cache == CACHE_NEVER) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); ++ conn->want &= ~FUSE_CAP_READDIRPLUS; ++ } + } + + static void lo_getattr(fuse_req_t req, fuse_ino_t ino, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-fix-refcounting-on-remove-r.patch b/kvm-virtiofsd-passthrough_ll-fix-refcounting-on-remove-r.patch new file mode 100755 index 0000000..00e11b4 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-fix-refcounting-on-remove-r.patch @@ -0,0 +1,143 @@ +From 5e33269d5fbc4ba4614bab4a6b9e0ef759bebcb7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:10 +0100 +Subject: [PATCH 099/116] virtiofsd: passthrough_ll: fix refcounting on + remove/rename +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-96-dgilbert@redhat.com> +Patchwork-id: 93549 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 095/112] virtiofsd: passthrough_ll: fix refcounting on remove/rename +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Signed-off-by: Miklos Szeredi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 9257e514d861afa759c36704e1904d43ca3fec88) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 50 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 49 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index c819b5f..e3a6d6b 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1140,17 +1140,42 @@ out_err: + fuse_reply_err(req, saverr); + } + ++static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent, ++ const char *name) ++{ ++ int res; ++ struct stat attr; ++ ++ res = fstatat(lo_fd(req, parent), name, &attr, ++ AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ return NULL; ++ } ++ ++ return lo_find(lo_data(req), &attr); ++} ++ + static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) + { + int res; ++ struct lo_inode *inode; ++ struct lo_data *lo = lo_data(req); ++ + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + ++ inode = lookup_name(req, parent, name); ++ if (!inode) { ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ + res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); + + fuse_reply_err(req, res == -1 ? 
errno : 0); ++ unref_inode_lolocked(lo, inode, 1); + } + + static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, +@@ -1158,12 +1183,23 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + unsigned int flags) + { + int res; ++ struct lo_inode *oldinode; ++ struct lo_inode *newinode; ++ struct lo_data *lo = lo_data(req); + + if (!is_safe_path_component(name) || !is_safe_path_component(newname)) { + fuse_reply_err(req, EINVAL); + return; + } + ++ oldinode = lookup_name(req, parent, name); ++ newinode = lookup_name(req, newparent, newname); ++ ++ if (!oldinode) { ++ fuse_reply_err(req, EIO); ++ goto out; ++ } ++ + if (flags) { + #ifndef SYS_renameat2 + fuse_reply_err(req, EINVAL); +@@ -1176,26 +1212,38 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + fuse_reply_err(req, res == -1 ? errno : 0); + } + #endif +- return; ++ goto out; + } + + res = renameat(lo_fd(req, parent), name, lo_fd(req, newparent), newname); + + fuse_reply_err(req, res == -1 ? errno : 0); ++out: ++ unref_inode_lolocked(lo, oldinode, 1); ++ unref_inode_lolocked(lo, newinode, 1); + } + + static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + { + int res; ++ struct lo_inode *inode; ++ struct lo_data *lo = lo_data(req); + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + ++ inode = lookup_name(req, parent, name); ++ if (!inode) { ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ + res = unlinkat(lo_fd(req, parent), name, 0); + + fuse_reply_err(req, res == -1 ? 
errno : 0); ++ unref_inode_lolocked(lo, inode, 1); + } + + static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-use-hashtable.patch b/kvm-virtiofsd-passthrough_ll-use-hashtable.patch new file mode 100755 index 0000000..b0be1f9 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-use-hashtable.patch @@ -0,0 +1,211 @@ +From 44f4434b1305f6ff47b4f63fafcf39bcea9e4ceb Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:52 +0100 +Subject: [PATCH 081/116] virtiofsd: passthrough_ll: use hashtable +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-78-dgilbert@redhat.com> +Patchwork-id: 93528 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 077/112] virtiofsd: passthrough_ll: use hashtable +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Improve performance of inode lookup by using a hash table. + +Signed-off-by: Miklos Szeredi +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit bfc50a6e06b10b2f9dbaf6c1a89dd523322e016f) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 81 ++++++++++++++++++++++------------------ + 1 file changed, 45 insertions(+), 36 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index b40f287..b176a31 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -84,13 +84,15 @@ struct lo_map { + ssize_t freelist; + }; + ++struct lo_key { ++ ino_t ino; ++ dev_t dev; ++}; ++ + struct lo_inode { +- struct lo_inode *next; /* protected by lo->mutex */ +- struct lo_inode *prev; /* protected by lo->mutex */ + int fd; + bool is_symlink; +- ino_t ino; +- dev_t dev; ++ struct lo_key key; + uint64_t refcount; /* protected by lo->mutex */ + fuse_ino_t fuse_ino; + }; +@@ -119,7 +121,8 @@ struct lo_data { + int timeout_set; + int readdirplus_set; + int readdirplus_clear; +- struct lo_inode root; /* protected by lo->mutex */ ++ struct lo_inode root; ++ GHashTable *inodes; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ + struct lo_map fd_map; /* protected by lo->mutex */ +@@ -573,7 +576,7 @@ retry: + } + goto fail_unref; + } +- if (stat.st_dev != inode->dev || stat.st_ino != inode->ino) { ++ if (stat.st_dev != inode->key.dev || stat.st_ino != inode->key.ino) { + if (!retries) { + fuse_log(FUSE_LOG_WARNING, + "%s: failed to match last\n", __func__); +@@ -753,19 +756,20 @@ out_err: + static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + { + struct lo_inode *p; +- struct lo_inode *ret = NULL; ++ struct lo_key key = { ++ .ino = st->st_ino, ++ .dev = st->st_dev, ++ }; + + pthread_mutex_lock(&lo->mutex); +- for (p = lo->root.next; p != &lo->root; p = p->next) { +- if (p->ino == st->st_ino && p->dev == st->st_dev) { +- assert(p->refcount > 0); +- ret = p; +- ret->refcount++; +- break; +- 
} ++ p = g_hash_table_lookup(lo->inodes, &key); ++ if (p) { ++ assert(p->refcount > 0); ++ p->refcount++; + } + pthread_mutex_unlock(&lo->mutex); +- return ret; ++ ++ return p; + } + + static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, +@@ -810,8 +814,6 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + close(newfd); + newfd = -1; + } else { +- struct lo_inode *prev, *next; +- + saverr = ENOMEM; + inode = calloc(1, sizeof(struct lo_inode)); + if (!inode) { +@@ -822,17 +824,12 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + inode->refcount = 1; + inode->fd = newfd; + newfd = -1; +- inode->ino = e->attr.st_ino; +- inode->dev = e->attr.st_dev; ++ inode->key.ino = e->attr.st_ino; ++ inode->key.dev = e->attr.st_dev; + + pthread_mutex_lock(&lo->mutex); + inode->fuse_ino = lo_add_inode_mapping(req, inode); +- prev = &lo->root; +- next = prev->next; +- next->prev = inode; +- inode->next = next; +- inode->prev = prev; +- prev->next = inode; ++ g_hash_table_insert(lo->inodes, &inode->key, inode); + pthread_mutex_unlock(&lo->mutex); + } + e->ino = inode->fuse_ino; +@@ -1162,14 +1159,8 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + assert(inode->refcount >= n); + inode->refcount -= n; + if (!inode->refcount) { +- struct lo_inode *prev, *next; +- +- prev = inode->prev; +- next = inode->next; +- next->prev = prev; +- prev->next = next; +- + lo_map_remove(&lo->ino_map, inode->fuse_ino); ++ g_hash_table_remove(lo->inodes, &inode->key); + pthread_mutex_unlock(&lo->mutex); + close(inode->fd); + free(inode); +@@ -1369,7 +1360,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + + /* Hide root's parent directory */ + if (dinode == &lo->root && strcmp(name, "..") == 0) { +- e.attr.st_ino = lo->root.ino; ++ e.attr.st_ino = lo->root.key.ino; + e.attr.st_mode = DT_DIR << 12; + } + +@@ -2370,11 +2361,26 @@ static void setup_root(struct 
lo_data *lo, struct lo_inode *root) + + root->is_symlink = false; + root->fd = fd; +- root->ino = stat.st_ino; +- root->dev = stat.st_dev; ++ root->key.ino = stat.st_ino; ++ root->key.dev = stat.st_dev; + root->refcount = 2; + } + ++static guint lo_key_hash(gconstpointer key) ++{ ++ const struct lo_key *lkey = key; ++ ++ return (guint)lkey->ino + (guint)lkey->dev; ++} ++ ++static gboolean lo_key_equal(gconstpointer a, gconstpointer b) ++{ ++ const struct lo_key *la = a; ++ const struct lo_key *lb = b; ++ ++ return la->ino == lb->ino && la->dev == lb->dev; ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2392,7 +2398,7 @@ int main(int argc, char *argv[]) + umask(0); + + pthread_mutex_init(&lo.mutex, NULL); +- lo.root.next = lo.root.prev = &lo.root; ++ lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal); + lo.root.fd = -1; + lo.root.fuse_ino = FUSE_ROOT_ID; + lo.cache = CACHE_AUTO; +@@ -2522,6 +2528,9 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + ++ if (lo.inodes) { ++ g_hash_table_destroy(lo.inodes); ++ } + lo_map_destroy(&lo.fd_map); + lo_map_destroy(&lo.dirp_map); + lo_map_destroy(&lo.ino_map); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-.-escape-in-lo_do_lookup.patch b/kvm-virtiofsd-prevent-.-escape-in-lo_do_lookup.patch new file mode 100755 index 0000000..68eb03e --- /dev/null +++ b/kvm-virtiofsd-prevent-.-escape-in-lo_do_lookup.patch @@ -0,0 +1,54 @@ +From feb005dfeb15dd5ac5156c994f323ab4c573b1fc Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:24 +0100 +Subject: [PATCH 053/116] virtiofsd: prevent ".." escape in lo_do_lookup() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-50-dgilbert@redhat.com> +Patchwork-id: 93500 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 049/112] virtiofsd: prevent ".." 
escape in lo_do_lookup() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 854684bc0b3d63eb90b3abdfe471c2e4271ef176) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e375406..79d5966 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -624,12 +624,17 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + int res; + int saverr; + struct lo_data *lo = lo_data(req); +- struct lo_inode *inode; ++ struct lo_inode *inode, *dir = lo_inode(req, parent); + + memset(e, 0, sizeof(*e)); + e->attr_timeout = lo->timeout; + e->entry_timeout = lo->timeout; + ++ /* Do not allow escaping root directory */ ++ if (dir == &lo->root && strcmp(name, "..") == 0) { ++ name = "."; ++ } ++ + newfd = openat(lo_fd(req, parent), name, O_PATH | O_NOFOLLOW); + if (newfd == -1) { + goto out_err; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-.-escape-in-lo_do_readdir.patch b/kvm-virtiofsd-prevent-.-escape-in-lo_do_readdir.patch new file mode 100755 index 0000000..5f97cbf --- /dev/null +++ b/kvm-virtiofsd-prevent-.-escape-in-lo_do_readdir.patch @@ -0,0 +1,108 @@ +From 97e232e75bbc0032f4a309d248f383384612eafe Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:25 +0100 +Subject: [PATCH 054/116] virtiofsd: prevent ".." escape in lo_do_readdir() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-51-dgilbert@redhat.com> +Patchwork-id: 93507 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 050/112] virtiofsd: prevent ".." escape in lo_do_readdir() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Construct a fake dirent for the root directory's ".." entry. This hides +the parent directory from the FUSE client. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 752272da2b68a2312f0e11fc5303015a6c3ee1ac) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 36 ++++++++++++++++++++++-------------- + 1 file changed, 22 insertions(+), 14 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 79d5966..e3d65c3 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1149,19 +1149,25 @@ out_err: + static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) + { ++ struct lo_data *lo = lo_data(req); + struct lo_dirp *d; ++ struct lo_inode *dinode; + char *buf = NULL; + char *p; + size_t rem = size; +- int err = ENOMEM; ++ int err = EBADF; + +- (void)ino; ++ dinode = lo_inode(req, ino); ++ if (!dinode) { ++ goto error; ++ } + + d = lo_dirp(req, fi); + if (!d) { + goto error; + } + ++ err = ENOMEM; + buf = calloc(1, size); + if (!buf) { + goto error; +@@ -1192,15 +1198,21 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + } + nextoff = d->entry->d_off; + name = d->entry->d_name; ++ + fuse_ino_t entry_ino = 0; ++ struct fuse_entry_param e = (struct fuse_entry_param){ ++ .attr.st_ino = d->entry->d_ino, ++ .attr.st_mode = d->entry->d_type << 12, ++ }; ++ ++ /* Hide root's parent directory */ ++ if (dinode == &lo->root && strcmp(name, "..") == 0) { ++ e.attr.st_ino = 
lo->root.ino; ++ e.attr.st_mode = DT_DIR << 12; ++ } ++ + if (plus) { +- struct fuse_entry_param e; +- if (is_dot_or_dotdot(name)) { +- e = (struct fuse_entry_param){ +- .attr.st_ino = d->entry->d_ino, +- .attr.st_mode = d->entry->d_type << 12, +- }; +- } else { ++ if (!is_dot_or_dotdot(name)) { + err = lo_do_lookup(req, ino, name, &e); + if (err) { + goto error; +@@ -1210,11 +1222,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + + entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff); + } else { +- struct stat st = { +- .st_ino = d->entry->d_ino, +- .st_mode = d->entry->d_type << 12, +- }; +- entsize = fuse_add_direntry(req, p, rem, name, &st, nextoff); ++ entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff); + } + if (entsize > rem) { + if (entry_ino != 0) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-FUSE_INIT-FUSE_DESTROY-races.patch b/kvm-virtiofsd-prevent-FUSE_INIT-FUSE_DESTROY-races.patch new file mode 100755 index 0000000..be7c120 --- /dev/null +++ b/kvm-virtiofsd-prevent-FUSE_INIT-FUSE_DESTROY-races.patch @@ -0,0 +1,103 @@ +From 249c02ae54739dc5894ee1b2905bbe8f1e79e909 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:20 +0100 +Subject: [PATCH 109/116] virtiofsd: prevent FUSE_INIT/FUSE_DESTROY races +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-106-dgilbert@redhat.com> +Patchwork-id: 93562 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 105/112] virtiofsd: prevent FUSE_INIT/FUSE_DESTROY races +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +When running with multiple threads it can be tricky to handle +FUSE_INIT/FUSE_DESTROY in parallel with other request types or in +parallel with themselves. 
Serialize FUSE_INIT and FUSE_DESTROY so that +malicious clients cannot trigger race conditions. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit cdc497c6925be745bc895355bd4674a17a4b2a8b) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 1 + + tools/virtiofsd/fuse_lowlevel.c | 18 ++++++++++++++++++ + 2 files changed, 19 insertions(+) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index a20854f..1447d86 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -61,6 +61,7 @@ struct fuse_session { + struct fuse_req list; + struct fuse_req interrupts; + pthread_mutex_t lock; ++ pthread_rwlock_t init_rwlock; + int got_destroy; + int broken_splice_nonblock; + uint64_t notify_ctr; +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index dab6a31..79a4031 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2428,6 +2428,19 @@ void fuse_session_process_buf_int(struct fuse_session *se, + req->ctx.pid = in->pid; + req->ch = ch; + ++ /* ++ * INIT and DESTROY requests are serialized, all other request types ++ * run in parallel. This prevents races between FUSE_INIT and ordinary ++ * requests, FUSE_INIT and FUSE_INIT, FUSE_INIT and FUSE_DESTROY, and ++ * FUSE_DESTROY and FUSE_DESTROY. 
++ */ ++ if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT || ++ in->opcode == FUSE_DESTROY) { ++ pthread_rwlock_wrlock(&se->init_rwlock); ++ } else { ++ pthread_rwlock_rdlock(&se->init_rwlock); ++ } ++ + err = EIO; + if (!se->got_init) { + enum fuse_opcode expected; +@@ -2485,10 +2498,13 @@ void fuse_session_process_buf_int(struct fuse_session *se, + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, &iter); + } ++ ++ pthread_rwlock_unlock(&se->init_rwlock); + return; + + reply_err: + fuse_reply_err(req, err); ++ pthread_rwlock_unlock(&se->init_rwlock); + } + + #define LL_OPTION(n, o, v) \ +@@ -2531,6 +2547,7 @@ void fuse_session_destroy(struct fuse_session *se) + se->op.destroy(se->userdata); + } + } ++ pthread_rwlock_destroy(&se->init_rwlock); + pthread_mutex_destroy(&se->lock); + free(se->cuse_data); + if (se->fd != -1) { +@@ -2610,6 +2627,7 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + list_init_req(&se->list); + list_init_req(&se->interrupts); + fuse_mutex_init(&se->lock); ++ pthread_rwlock_init(&se->init_rwlock, NULL); + + memcpy(&se->op, op, op_size); + se->owner = getuid(); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-fv_queue_thread-vs-virtio_loop-rac.patch b/kvm-virtiofsd-prevent-fv_queue_thread-vs-virtio_loop-rac.patch new file mode 100755 index 0000000..8eabede --- /dev/null +++ b/kvm-virtiofsd-prevent-fv_queue_thread-vs-virtio_loop-rac.patch @@ -0,0 +1,149 @@ +From 69c6a829f8136a8c95ccdf480f2fd0173d64b6ec Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:05 +0100 +Subject: [PATCH 094/116] virtiofsd: prevent fv_queue_thread() vs virtio_loop() + races +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-91-dgilbert@redhat.com> +Patchwork-id: 93544 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 090/112] virtiofsd: prevent fv_queue_thread() vs virtio_loop() races +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +We call into libvhost-user from the virtqueue handler thread and the +vhost-user message processing thread without a lock. There is nothing +protecting the virtqueue handler thread if the vhost-user message +processing thread changes the virtqueue or memory table while it is +running. + +This patch introduces a read-write lock. Virtqueue handler threads are +readers. The vhost-user message processing thread is a writer. This +will allow concurrency for multiqueue in the future while protecting +against fv_queue_thread() vs virtio_loop() races. + +Note that the critical sections could be made smaller but it would be +more invasive and require libvhost-user changes. Let's start simple and +improve performance later, if necessary. Another option would be an +RCU-style approach with lighter-weight primitives. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e7b337326d594b71b07cd6dbb332c49c122c80a4) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 34 +++++++++++++++++++++++++++++++++- + 1 file changed, 33 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index fb8d6d1..f6242f9 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -59,6 +59,18 @@ struct fv_VuDev { + struct fuse_session *se; + + /* ++ * Either handle virtqueues or vhost-user protocol messages. Don't do ++ * both at the same time since that could lead to race conditions if ++ * virtqueues or memory tables change while another thread is accessing ++ * them. 
++ * ++ * The assumptions are: ++ * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev. ++ * 2. virtio_loop() reads/writes virtqueues and VuDev. ++ */ ++ pthread_rwlock_t vu_dispatch_rwlock; ++ ++ /* + * The following pair of fields are only accessed in the main + * virtio_loop + */ +@@ -415,6 +427,8 @@ static void *fv_queue_thread(void *opaque) + qi->qidx, qi->kick_fd); + while (1) { + struct pollfd pf[2]; ++ int ret; ++ + pf[0].fd = qi->kick_fd; + pf[0].events = POLLIN; + pf[0].revents = 0; +@@ -461,6 +475,9 @@ static void *fv_queue_thread(void *opaque) + fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n"); + break; + } ++ /* Mutual exclusion with virtio_loop() */ ++ ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ assert(ret == 0); /* there is no possible error case */ + /* out is from guest, in is too guest */ + unsigned int in_bytes, out_bytes; + vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0); +@@ -469,6 +486,7 @@ static void *fv_queue_thread(void *opaque) + "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n", + __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); + ++ + while (1) { + bool allocated_bufv = false; + struct fuse_bufvec bufv; +@@ -597,6 +615,8 @@ static void *fv_queue_thread(void *opaque) + free(elem); + elem = NULL; + } ++ ++ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + } + out: + pthread_mutex_destroy(&ch.lock); +@@ -711,6 +731,8 @@ int virtio_loop(struct fuse_session *se) + + while (!fuse_session_exited(se)) { + struct pollfd pf[1]; ++ bool ok; ++ int ret; + pf[0].fd = se->vu_socketfd; + pf[0].events = POLLIN; + pf[0].revents = 0; +@@ -735,7 +757,15 @@ int virtio_loop(struct fuse_session *se) + } + assert(pf[0].revents & POLLIN); + fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__); +- if (!vu_dispatch(&se->virtio_dev->dev)) { ++ /* Mutual exclusion with fv_queue_thread() */ ++ ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock); ++ 
assert(ret == 0); /* there is no possible error case */ ++ ++ ok = vu_dispatch(&se->virtio_dev->dev); ++ ++ pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock); ++ ++ if (!ok) { + fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__); + break; + } +@@ -877,6 +907,7 @@ int virtio_session_mount(struct fuse_session *se) + + se->vu_socketfd = data_sock; + se->virtio_dev->se = se; ++ pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL); + vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, fv_set_watch, + fv_remove_watch, &fv_iface); + +@@ -892,6 +923,7 @@ void virtio_session_close(struct fuse_session *se) + } + + free(se->virtio_dev->qi); ++ pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock); + free(se->virtio_dev); + se->virtio_dev = NULL; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch b/kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch new file mode 100755 index 0000000..5956dce --- /dev/null +++ b/kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch @@ -0,0 +1,314 @@ +From cc9a776fba8ec62c862db55753107f19459dafa8 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 9 Feb 2021 23:14:56 -0500 +Subject: [PATCH 3/3] virtiofsd: prevent opening of special files + (CVE-2020-35517) + +RH-Author: Jon Maloy +Message-id: <20210209231456.1555472-4-jmaloy@redhat.com> +Patchwork-id: 101023 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/3] virtiofsd: prevent opening of special files (CVE-2020-35517) +Bugzilla: 1919111 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Greg Kurz +RH-Acked-by: Dr. David Alan Gilbert + +From: Stefan Hajnoczi + +A well-behaved FUSE client does not attempt to open special files with +FUSE_OPEN because they are handled on the client side (e.g. device nodes +are handled by client-side device drivers). + +The check to prevent virtiofsd from opening special files is missing in +a few cases, most notably FUSE_OPEN. 
A malicious client can cause +virtiofsd to open a device node, potentially allowing the guest to +escape. This can be exploited by a modified guest device driver. It is +not exploitable from guest userspace since the guest kernel will handle +special files inside the guest instead of sending FUSE requests. + +This patch fixes this issue by introducing the lo_inode_open() function +to check the file type before opening it. This is a short-term solution +because it does not prevent a compromised virtiofsd process from opening +device nodes on the host. + +Restructure lo_create() to try O_CREAT | O_EXCL first. Note that O_CREAT +| O_EXCL does not follow symlinks, so O_NOFOLLOW masking is not +necessary here. If the file exists and the user did not specify O_EXCL, +open it via lo_do_open(). + +Reported-by: Alex Xu +Fixes: CVE-2020-35517 +Reviewed-by: Dr. David Alan Gilbert +Reviewed-by: Vivek Goyal +Reviewed-by: Greg Kurz +Signed-off-by: Stefan Hajnoczi +Message-Id: <20210204150208.367837-4-stefanha@redhat.com> +Signed-off-by: Dr. David Alan Gilbert + +(cherry picked from commit a3fdbbc7f271bff7d53d0501b29d910ece0b3789) +Signed-off-by: Jon Maloy +Signed-off-by: Jon Maloy +--- + tools/virtiofsd/passthrough_ll.c | 144 ++++++++++++++++++++----------- + 1 file changed, 92 insertions(+), 52 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e5bd3d73e4..cb0992f2db 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -535,6 +535,38 @@ static int lo_fd(fuse_req_t req, fuse_ino_t ino) + return fd; + } + ++/* ++ * Open a file descriptor for an inode. Returns -EBADF if the inode is not a ++ * regular file or a directory. ++ * ++ * Use this helper function instead of raw openat(2) to prevent security issues ++ * when a malicious client opens special files such as block device nodes. ++ * Symlink inodes are also rejected since symlinks must already have been ++ * traversed on the client side. 
++ */ ++static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode, ++ int open_flags) ++{ ++ g_autofree char *fd_str = g_strdup_printf("%d", inode->fd); ++ int fd; ++ ++ if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) { ++ return -EBADF; ++ } ++ ++ /* ++ * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier ++ * that the inode is not a special file but if an external process races ++ * with us then symlinks are traversed here. It is not possible to escape ++ * the shared directory since it is mounted as "/" though. ++ */ ++ fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW); ++ if (fd < 0) { ++ return -errno; ++ } ++ return fd; ++} ++ + static void lo_init(void *userdata, struct fuse_conn_info *conn) + { + struct lo_data *lo = (struct lo_data *)userdata; +@@ -788,9 +820,9 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + if (fi) { + truncfd = fd; + } else { +- sprintf(procname, "%i", ifd); +- truncfd = openat(lo->proc_self_fd, procname, O_RDWR); ++ truncfd = lo_inode_open(lo, inode, O_RDWR); + if (truncfd < 0) { ++ errno = -truncfd; + goto out_err; + } + } +@@ -894,7 +926,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct lo_inode *dir = lo_inode(req, parent); + + if (inodep) { +- *inodep = NULL; ++ *inodep = NULL; /* in case there is an error */ + } + + /* +@@ -1725,19 +1757,26 @@ static void update_open_flags(int writeback, struct fuse_file_info *fi) + fi->flags &= ~O_DIRECT; + } + ++/* ++ * Open a regular file, set up an fd mapping, and fill out the struct ++ * fuse_file_info for it. If existing_fd is not negative, use that fd instead ++ * opening a new one. Takes ownership of existing_fd. ++ * ++ * Returns 0 on success or a positive errno. 
++ */ + static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, +- struct fuse_file_info *fi) ++ int existing_fd, struct fuse_file_info *fi) + { +- char buf[64]; + ssize_t fh; +- int fd; ++ int fd = existing_fd; + + update_open_flags(lo->writeback, fi); + +- sprintf(buf, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); +- if (fd == -1) { +- return errno; ++ if (fd < 0) { ++ fd = lo_inode_open(lo, inode, fi->flags); ++ if (fd < 0) { ++ return -fd; ++ } + } + + pthread_mutex_lock(&lo->mutex); +@@ -1760,9 +1799,10 @@ static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) + { +- int fd; ++ int fd = -1; + struct lo_data *lo = lo_data(req); + struct lo_inode *parent_inode; ++ struct lo_inode *inode = NULL; + struct fuse_entry_param e; + int err; + struct lo_cred old = {}; +@@ -1788,36 +1828,38 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + + update_open_flags(lo->writeback, fi); + +- fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, +- mode); ++ /* Try to create a new file but don't open existing files */ ++ fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode); + err = fd == -1 ? 
errno : 0; +- lo_restore_cred(&old); + +- if (!err) { +- ssize_t fh; ++ lo_restore_cred(&old); + +- pthread_mutex_lock(&lo->mutex); +- fh = lo_add_fd_mapping(lo, fd); +- pthread_mutex_unlock(&lo->mutex); +- if (fh == -1) { +- close(fd); +- err = ENOMEM; +- goto out; +- } ++ /* Ignore the error if file exists and O_EXCL was not given */ ++ if (err && (err != EEXIST || (fi->flags & O_EXCL))) { ++ goto out; ++ } + +- fi->fh = fh; +- err = lo_do_lookup(req, parent, name, &e, NULL); ++ err = lo_do_lookup(req, parent, name, &e, &inode); ++ if (err) { ++ goto out; + } +- if (lo->cache == CACHE_NONE) { +- fi->direct_io = 1; +- } else if (lo->cache == CACHE_ALWAYS) { +- fi->keep_cache = 1; ++ ++ err = lo_do_open(lo, inode, fd, fi); ++ fd = -1; /* lo_do_open() takes ownership of fd */ ++ if (err) { ++ /* Undo lo_do_lookup() nlookup ref */ ++ unref_inode_lolocked(lo, inode, 1); + } + + out: ++ lo_inode_put(lo, &inode); + lo_inode_put(lo, &parent_inode); + + if (err) { ++ if (fd >= 0) { ++ close(fd); ++ } ++ + fuse_reply_err(req, err); + } else { + fuse_reply_create(req, &e, fi); +@@ -1831,7 +1873,6 @@ static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, + pid_t pid, int *err) + { + struct lo_inode_plock *plock; +- char procname[64]; + int fd; + + plock = +@@ -1848,12 +1889,10 @@ static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, + } + + /* Open another instance of file which can be used for ofd locks. */ +- sprintf(procname, "%i", inode->fd); +- + /* TODO: What if file is not writable? 
*/ +- fd = openat(lo->proc_self_fd, procname, O_RDWR); +- if (fd == -1) { +- *err = errno; ++ fd = lo_inode_open(lo, inode, O_RDWR); ++ if (fd < 0) { ++ *err = -fd; + free(plock); + return NULL; + } +@@ -2000,7 +2039,7 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + return; + } + +- err = lo_do_open(lo, inode, fi); ++ err = lo_do_open(lo, inode, -1, fi); + lo_inode_put(lo, &inode); + if (err) { + fuse_reply_err(req, err); +@@ -2056,39 +2095,40 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) + { ++ struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_data *lo = lo_data(req); + int res; + int fd; +- char *buf; + + fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino, + (void *)fi); + +- if (!fi) { +- struct lo_data *lo = lo_data(req); +- +- res = asprintf(&buf, "%i", lo_fd(req, ino)); +- if (res == -1) { +- return (void)fuse_reply_err(req, errno); +- } ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } + +- fd = openat(lo->proc_self_fd, buf, O_RDWR); +- free(buf); +- if (fd == -1) { +- return (void)fuse_reply_err(req, errno); ++ if (!fi) { ++ fd = lo_inode_open(lo, inode, O_RDWR); ++ if (fd < 0) { ++ res = -fd; ++ goto out; + } + } else { + fd = lo_fi_fd(req, fi); + } + + if (datasync) { +- res = fdatasync(fd); ++ res = fdatasync(fd) == -1 ? errno : 0; + } else { +- res = fsync(fd); ++ res = fsync(fd) == -1 ? errno : 0; + } + if (!fi) { + close(fd); + } +- fuse_reply_err(req, res == -1 ? 
errno : 0); ++out: ++ lo_inode_put(lo, &inode); ++ fuse_reply_err(req, res); + } + + static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, +-- +2.18.2 + diff --git a/kvm-virtiofsd-prevent-races-with-lo_dirp_put.patch b/kvm-virtiofsd-prevent-races-with-lo_dirp_put.patch new file mode 100755 index 0000000..acafa41 --- /dev/null +++ b/kvm-virtiofsd-prevent-races-with-lo_dirp_put.patch @@ -0,0 +1,147 @@ +From 2e58ff6978f8433fc8672d2e357c6f0f5f36d24f Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:07 +0100 +Subject: [PATCH 096/116] virtiofsd: prevent races with lo_dirp_put() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-93-dgilbert@redhat.com> +Patchwork-id: 93546 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 092/112] virtiofsd: prevent races with lo_dirp_put() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Introduce lo_dirp_put() so that FUSE_RELEASEDIR does not cause +use-after-free races with other threads that are accessing lo_dirp. + +Also make lo_releasedir() atomic to prevent FUSE_RELEASEDIR racing with +itself. This prevents double-frees. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit acefdde73b403576a241ebd8dbe8431ddc0d9442) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 41 ++++++++++++++++++++++++++++++++++------ + 1 file changed, 35 insertions(+), 6 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 690edbc..2d703b5 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1284,11 +1284,28 @@ static void lo_readlink(fuse_req_t req, fuse_ino_t ino) + } + + struct lo_dirp { ++ gint refcount; + DIR *dp; + struct dirent *entry; + off_t offset; + }; + ++static void lo_dirp_put(struct lo_dirp **dp) ++{ ++ struct lo_dirp *d = *dp; ++ ++ if (!d) { ++ return; ++ } ++ *dp = NULL; ++ ++ if (g_atomic_int_dec_and_test(&d->refcount)) { ++ closedir(d->dp); ++ free(d); ++ } ++} ++ ++/* Call lo_dirp_put() on the return value when no longer needed */ + static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) + { + struct lo_data *lo = lo_data(req); +@@ -1296,6 +1313,9 @@ static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->dirp_map, fi->fh); ++ if (elem) { ++ g_atomic_int_inc(&elem->dirp->refcount); ++ } + pthread_mutex_unlock(&lo->mutex); + if (!elem) { + return NULL; +@@ -1331,6 +1351,7 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + d->offset = 0; + d->entry = NULL; + ++ g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */ + pthread_mutex_lock(&lo->mutex); + fh = lo_add_dirp_mapping(req, d); + pthread_mutex_unlock(&lo->mutex); +@@ -1364,7 +1385,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) + { + struct lo_data *lo = lo_data(req); +- struct lo_dirp *d; ++ struct lo_dirp *d = NULL; + struct lo_inode *dinode; + char *buf = NULL; + char *p; +@@ -1454,6 +1475,8 @@ static void 
lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + + err = 0; + error: ++ lo_dirp_put(&d); ++ + /* + * If there's an error, we can only signal it if we haven't stored + * any entries yet - otherwise we'd end up with wrong lookup +@@ -1484,22 +1507,25 @@ static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) + { + struct lo_data *lo = lo_data(req); ++ struct lo_map_elem *elem; + struct lo_dirp *d; + + (void)ino; + +- d = lo_dirp(req, fi); +- if (!d) { ++ pthread_mutex_lock(&lo->mutex); ++ elem = lo_map_get(&lo->dirp_map, fi->fh); ++ if (!elem) { ++ pthread_mutex_unlock(&lo->mutex); + fuse_reply_err(req, EBADF); + return; + } + +- pthread_mutex_lock(&lo->mutex); ++ d = elem->dirp; + lo_map_remove(&lo->dirp_map, fi->fh); + pthread_mutex_unlock(&lo->mutex); + +- closedir(d->dp); +- free(d); ++ lo_dirp_put(&d); /* paired with lo_opendir() */ ++ + fuse_reply_err(req, 0); + } + +@@ -1710,6 +1736,9 @@ static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + } else { + res = fsync(fd); + } ++ ++ lo_dirp_put(&d); ++ + fuse_reply_err(req, res == -1 ? errno : 0); + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-print-log-only-when-priority-is-high-enoug.patch b/kvm-virtiofsd-print-log-only-when-priority-is-high-enoug.patch new file mode 100755 index 0000000..056559d --- /dev/null +++ b/kvm-virtiofsd-print-log-only-when-priority-is-high-enoug.patch @@ -0,0 +1,469 @@ +From 5c9bbd00e8f8c944d9e8e22e7d1cf08cb8fddd6b Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:37 +0100 +Subject: [PATCH 066/116] virtiofsd: print log only when priority is high + enough +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-63-dgilbert@redhat.com> +Patchwork-id: 93518 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 062/112] virtiofsd: print log only when priority is high enough +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Eryu Guan + +Introduce "-o log_level=" command line option to specify current log +level (priority), valid values are "debug info warn err", e.g. + + ./virtiofsd -o log_level=debug ... + +So only log priority higher than "debug" will be printed to +stderr/syslog. And the default level is info. + +The "-o debug"/"-d" options are kept, and imply debug log level. + +Signed-off-by: Eryu Guan +dgilbert: Reworked for libfuse's log_func +Signed-off-by: Dr. David Alan Gilbert +with fix by: +Signed-off-by: Xiao Yang +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit d240314a1a18a1d914af1b5763fe8c9a572e6409) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 75 ++++++++++--------------- + tools/virtiofsd/fuse_lowlevel.h | 1 + + tools/virtiofsd/helper.c | 8 ++- + tools/virtiofsd/passthrough_ll.c | 118 ++++++++++++++++----------------------- + 4 files changed, 87 insertions(+), 115 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 6ceb33d..a7a1968 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -158,19 +158,17 @@ static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct fuse_out_header *out = iov[0].iov_base; + + out->len = iov_length(iov, count); +- if (se->debug) { +- if (out->unique == 0) { +- fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error, +- out->len); +- } else if (out->error) { +- fuse_log(FUSE_LOG_DEBUG, +- " unique: %llu, error: %i (%s), outsize: %i\n", +- (unsigned long long)out->unique, out->error, +- strerror(-out->error), 
out->len); +- } else { +- fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %i\n", +- (unsigned long long)out->unique, out->len); +- } ++ if (out->unique == 0) { ++ fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error, ++ out->len); ++ } else if (out->error) { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, error: %i (%s), outsize: %i\n", ++ (unsigned long long)out->unique, out->error, ++ strerror(-out->error), out->len); ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %i\n", ++ (unsigned long long)out->unique, out->len); + } + + if (fuse_lowlevel_is_virtio(se)) { +@@ -1662,10 +1660,8 @@ static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, + return; + } + +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", +- (unsigned long long)arg->unique); +- } ++ fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", ++ (unsigned long long)arg->unique); + + req->u.i.unique = arg->unique; + +@@ -1901,13 +1897,10 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + } + } + +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); +- if (arg->major == 7 && arg->minor >= 6) { +- fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); +- fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", +- arg->max_readahead); +- } ++ fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); ++ if (arg->major == 7 && arg->minor >= 6) { ++ fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); ++ fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", arg->max_readahead); + } + se->conn.proto_major = arg->major; + se->conn.proto_minor = arg->minor; +@@ -2116,19 +2109,14 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + outarg.congestion_threshold = se->conn.congestion_threshold; + outarg.time_gran = se->conn.time_gran; + +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, +- outarg.minor); +- fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); +- 
fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", +- outarg.max_readahead); +- fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); +- fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", +- outarg.max_background); +- fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", +- outarg.congestion_threshold); +- fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); +- } ++ fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); ++ fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); ++ fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead); ++ fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); ++ fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", outarg.max_background); ++ fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", ++ outarg.congestion_threshold); ++ fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); + + send_reply_ok(req, &outarg, outargsize); + } +@@ -2407,14 +2395,11 @@ void fuse_session_process_buf_int(struct fuse_session *se, + in = fuse_mbuf_iter_advance(&iter, sizeof(*in)); + assert(in); /* caller guarantees the input buffer is large enough */ + +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, +- "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, " +- "pid: %u\n", +- (unsigned long long)in->unique, +- opname((enum fuse_opcode)in->opcode), in->opcode, +- (unsigned long long)in->nodeid, buf->size, in->pid); +- } ++ fuse_log( ++ FUSE_LOG_DEBUG, ++ "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n", ++ (unsigned long long)in->unique, opname((enum fuse_opcode)in->opcode), ++ in->opcode, (unsigned long long)in->nodeid, buf->size, in->pid); + + req = fuse_ll_alloc_req(se); + if (req == NULL) { +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index f2750bc..138041e 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1796,6 +1796,7 @@ struct fuse_cmdline_opts { + int show_help; + 
int print_capabilities; + int syslog; ++ int log_level; + unsigned int max_idle_threads; + }; + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 9692ef9..6d50a46 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -34,7 +34,6 @@ + t, offsetof(struct fuse_cmdline_opts, p), v \ + } + +- + static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("-h", show_help), + FUSE_HELPER_OPT("--help", show_help), +@@ -55,6 +54,10 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), + FUSE_HELPER_OPT("--syslog", syslog), ++ FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG), ++ FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO), ++ FUSE_HELPER_OPT_VALUE("log_level=warn", log_level, FUSE_LOG_WARNING), ++ FUSE_HELPER_OPT_VALUE("log_level=err", log_level, FUSE_LOG_ERR), + FUSE_OPT_END + }; + +@@ -142,6 +145,9 @@ void fuse_cmdline_help(void) + " --syslog log to syslog (default stderr)\n" + " -f foreground operation\n" + " --daemonize run in background\n" ++ " -o log_level= log level, default to \"info\"\n" ++ " level could be one of \"debug, " ++ "info, warn, err\"\n" + " -o max_idle_threads the maximum number of idle worker " + "threads\n" + " allowed (default: 10)\n" +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 0372aca..ff6910f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -37,6 +37,7 @@ + + #include "qemu/osdep.h" + #include "fuse_virtio.h" ++#include "fuse_log.h" + #include "fuse_lowlevel.h" + #include + #include +@@ -140,6 +141,7 @@ static const struct fuse_opt lo_opts[] = { + FUSE_OPT_END + }; + static bool use_syslog = false; ++static int current_log_level; + + static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); + +@@ -458,11 +460,6 @@ static int lo_fd(fuse_req_t 
req, fuse_ino_t ino) + return inode ? inode->fd : -1; + } + +-static bool lo_debug(fuse_req_t req) +-{ +- return lo_data(req)->debug != 0; +-} +- + static void lo_init(void *userdata, struct fuse_conn_info *conn) + { + struct lo_data *lo = (struct lo_data *)userdata; +@@ -472,15 +469,11 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + } + + if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) { +- if (lo->debug) { +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); + conn->want |= FUSE_CAP_WRITEBACK_CACHE; + } + if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { +- if (lo->debug) { +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } + } +@@ -823,10 +816,8 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + } + e->ino = inode->fuse_ino; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long)parent, name, (unsigned long long)e->ino); +- } ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, ++ name, (unsigned long long)e->ino); + + return 0; + +@@ -843,10 +834,8 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + struct fuse_entry_param e; + int err; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", +- parent, name); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent, ++ name); + + /* + * Don't use is_safe_path_component(), allow "." and ".." 
for NFS export +@@ -971,10 +960,8 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long)parent, name, (unsigned long long)e.ino); +- } ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, ++ name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); + return; +@@ -1074,10 +1061,8 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + pthread_mutex_unlock(&lo->mutex); + e.ino = inode->fuse_ino; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long)parent, name, (unsigned long long)e.ino); +- } ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, ++ name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); + return; +@@ -1171,11 +1156,9 @@ static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + return; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", +- (unsigned long long)ino, (unsigned long long)inode->refcount, +- (unsigned long long)nlookup); +- } ++ fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", ++ (unsigned long long)ino, (unsigned long long)inode->refcount, ++ (unsigned long long)nlookup); + + unref_inode(lo, inode, nlookup); + } +@@ -1445,10 +1428,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + int err; + struct lo_cred old = {}; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", +- parent, name); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent, ++ name); + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); +@@ -1525,10 +1506,8 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + char buf[64]; + struct lo_data *lo = lo_data(req); + +- if (lo_debug(req)) { +- 
fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, +- fi->flags); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, ++ fi->flags); + + /* + * With writeback cache, kernel may send read requests even +@@ -1644,12 +1623,10 @@ static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, + { + struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, +- "lo_read(ino=%" PRIu64 ", size=%zd, " +- "off=%lu)\n", +- ino, size, (unsigned long)offset); +- } ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_read(ino=%" PRIu64 ", size=%zd, " ++ "off=%lu)\n", ++ ino, size, (unsigned long)offset); + + buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; + buf.buf[0].fd = lo_fi_fd(req, fi); +@@ -1671,11 +1648,9 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + out_buf.buf[0].fd = lo_fi_fd(req, fi); + out_buf.buf[0].pos = off; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, +- "lo_write(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, +- out_buf.buf[0].size, (unsigned long)off); +- } ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, ++ out_buf.buf[0].size, (unsigned long)off); + + /* + * If kill_priv is set, drop CAP_FSETID which should lead to kernel +@@ -1774,11 +1749,8 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, +- "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", ino, name, +- size); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", ++ ino, name, size); + + if (inode->is_symlink) { + /* Sorry, no race free way to getxattr on symlink. 
*/ +@@ -1852,10 +1824,8 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", +- ino, size); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino, ++ size); + + if (inode->is_symlink) { + /* Sorry, no race free way to listxattr on symlink. */ +@@ -1929,11 +1899,8 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, +- "lo_setxattr(ino=%" PRIu64 ", name=%s value=%s size=%zd)\n", +- ino, name, value, size); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 ++ ", name=%s value=%s size=%zd)\n", ino, name, value, size); + + if (inode->is_symlink) { + /* Sorry, no race free way to setxattr on symlink. */ +@@ -1978,10 +1945,8 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", +- ino, name); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino, ++ name); + + if (inode->is_symlink) { + /* Sorry, no race free way to setxattr on symlink. */ +@@ -2303,6 +2268,10 @@ static void setup_nofile_rlimit(void) + + static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) + { ++ if (current_log_level < level) { ++ return; ++ } ++ + if (use_syslog) { + int priority = LOG_ERR; + switch (level) { +@@ -2401,8 +2370,19 @@ int main(int argc, char *argv[]) + return 1; + } + ++ /* ++ * log_level is 0 if not configured via cmd options (0 is LOG_EMERG, ++ * and we don't use this log level). 
++ */ ++ if (opts.log_level != 0) { ++ current_log_level = opts.log_level; ++ } + lo.debug = opts.debug; ++ if (lo.debug) { ++ current_log_level = FUSE_LOG_DEBUG; ++ } + lo.root.refcount = 2; ++ + if (lo.source) { + struct stat stat; + int res; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-process-requests-in-a-thread-pool.patch b/kvm-virtiofsd-process-requests-in-a-thread-pool.patch new file mode 100755 index 0000000..87fff99 --- /dev/null +++ b/kvm-virtiofsd-process-requests-in-a-thread-pool.patch @@ -0,0 +1,533 @@ +From b0db5e666aaa43eadff3e60a1ada704f33b03074 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:19 +0100 +Subject: [PATCH 108/116] virtiofsd: process requests in a thread pool +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-105-dgilbert@redhat.com> +Patchwork-id: 93554 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 104/112] virtiofsd: process requests in a thread pool +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Introduce a thread pool so that fv_queue_thread() just pops +VuVirtqElements and hands them to the thread pool. For the time being +only one worker thread is allowed since passthrough_ll.c is not +thread-safe yet. Future patches will lift this restriction so that +multiple FUSE requests can be processed in parallel. + +The main new concept is struct FVRequest, which contains both +VuVirtqElement and struct fuse_chan. We now have fv_VuDev for a device, +fv_QueueInfo for a virtqueue, and FVRequest for a request. Some of +fv_QueueInfo's fields are moved into FVRequest because they are +per-request. The name FVRequest conforms to QEMU coding style and I +expect the struct fv_* types will be renamed in a future refactoring. + +This patch series is not optimal. 
fbuf reuse is dropped so each request +does malloc(se->bufsize), but there is no clean and cheap way to keep +this with a thread pool. The vq_lock mutex is held for longer than +necessary, especially during the eventfd_write() syscall. Performance +can be improved in the future. + +prctl(2) had to be added to the seccomp whitelist because glib invokes +it. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a3d756c5aecccc4c0e51060a7e2f1c87bf8f1180) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 359 +++++++++++++++++++++++------------------- + 1 file changed, 201 insertions(+), 158 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index f6242f9..0dcf2ef 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -22,6 +22,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -37,17 +38,28 @@ + struct fv_VuDev; + struct fv_QueueInfo { + pthread_t thread; ++ /* ++ * This lock protects the VuVirtq preventing races between ++ * fv_queue_thread() and fv_queue_worker(). 
++ */ ++ pthread_mutex_t vq_lock; ++ + struct fv_VuDev *virtio_dev; + + /* Our queue index, corresponds to array position */ + int qidx; + int kick_fd; + int kill_fd; /* For killing the thread */ ++}; + +- /* The element for the command currently being processed */ +- VuVirtqElement *qe; ++/* A FUSE request */ ++typedef struct { ++ VuVirtqElement elem; ++ struct fuse_chan ch; ++ ++ /* Used to complete requests that involve no reply */ + bool reply_sent; +-}; ++} FVRequest; + + /* + * We pass the dev element into libvhost-user +@@ -191,8 +203,11 @@ static void copy_iov(struct iovec *src_iov, int src_count, + int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count) + { +- VuVirtqElement *elem; +- VuVirtq *q; ++ FVRequest *req = container_of(ch, FVRequest, ch); ++ struct fv_QueueInfo *qi = ch->qi; ++ VuDev *dev = &se->virtio_dev->dev; ++ VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ VuVirtqElement *elem = &req->elem; + int ret = 0; + + assert(count >= 1); +@@ -205,11 +220,7 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + + /* unique == 0 is notification, which we don't support */ + assert(out->unique); +- /* For virtio we always have ch */ +- assert(ch); +- assert(!ch->qi->reply_sent); +- elem = ch->qi->qe; +- q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; ++ assert(!req->reply_sent); + + /* The 'in' part of the elem is to qemu */ + unsigned int in_num = elem->in_num; +@@ -236,9 +247,15 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + } + + copy_iov(iov, count, in_sg, in_num, tosend_len); +- vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); +- vu_queue_notify(&se->virtio_dev->dev, q); +- ch->qi->reply_sent = true; ++ ++ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ pthread_mutex_lock(&qi->vq_lock); ++ vu_queue_push(dev, q, elem, tosend_len); ++ vu_queue_notify(dev, q); ++ pthread_mutex_unlock(&qi->vq_lock); ++ 
pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ ++ req->reply_sent = true; + + err: + return ret; +@@ -254,9 +271,12 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count, struct fuse_bufvec *buf, + size_t len) + { ++ FVRequest *req = container_of(ch, FVRequest, ch); ++ struct fv_QueueInfo *qi = ch->qi; ++ VuDev *dev = &se->virtio_dev->dev; ++ VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ VuVirtqElement *elem = &req->elem; + int ret = 0; +- VuVirtqElement *elem; +- VuVirtq *q; + + assert(count >= 1); + assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); +@@ -275,11 +295,7 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + /* unique == 0 is notification which we don't support */ + assert(out->unique); + +- /* For virtio we always have ch */ +- assert(ch); +- assert(!ch->qi->reply_sent); +- elem = ch->qi->qe; +- q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; ++ assert(!req->reply_sent); + + /* The 'in' part of the elem is to qemu */ + unsigned int in_num = elem->in_num; +@@ -395,33 +411,175 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + + ret = 0; + +- vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); +- vu_queue_notify(&se->virtio_dev->dev, q); ++ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ pthread_mutex_lock(&qi->vq_lock); ++ vu_queue_push(dev, q, elem, tosend_len); ++ vu_queue_notify(dev, q); ++ pthread_mutex_unlock(&qi->vq_lock); ++ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + + err: + if (ret == 0) { +- ch->qi->reply_sent = true; ++ req->reply_sent = true; + } + + return ret; + } + ++/* Process one FVRequest in a thread pool */ ++static void fv_queue_worker(gpointer data, gpointer user_data) ++{ ++ struct fv_QueueInfo *qi = user_data; ++ struct fuse_session *se = qi->virtio_dev->se; ++ struct VuDev *dev = &qi->virtio_dev->dev; ++ FVRequest *req = data; ++ VuVirtqElement *elem = &req->elem; ++ 
struct fuse_buf fbuf = {}; ++ bool allocated_bufv = false; ++ struct fuse_bufvec bufv; ++ struct fuse_bufvec *pbufv; ++ ++ assert(se->bufsize > sizeof(struct fuse_in_header)); ++ ++ /* ++ * An element contains one request and the space to send our response ++ * They're spread over multiple descriptors in a scatter/gather set ++ * and we can't trust the guest to keep them still; so copy in/out. ++ */ ++ fbuf.mem = malloc(se->bufsize); ++ assert(fbuf.mem); ++ ++ fuse_mutex_init(&req->ch.lock); ++ req->ch.fd = -1; ++ req->ch.qi = qi; ++ ++ /* The 'out' part of the elem is from qemu */ ++ unsigned int out_num = elem->out_num; ++ struct iovec *out_sg = elem->out_sg; ++ size_t out_len = iov_size(out_sg, out_num); ++ fuse_log(FUSE_LOG_DEBUG, ++ "%s: elem %d: with %d out desc of length %zd\n", ++ __func__, elem->index, out_num, out_len); ++ ++ /* ++ * The elem should contain a 'fuse_in_header' (in to fuse) ++ * plus the data based on the len in the header. ++ */ ++ if (out_len < sizeof(struct fuse_in_header)) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n", ++ __func__, elem->index); ++ assert(0); /* TODO */ ++ } ++ if (out_len > se->bufsize) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__, ++ elem->index); ++ assert(0); /* TODO */ ++ } ++ /* Copy just the first element and look at it */ ++ copy_from_iov(&fbuf, 1, out_sg); ++ ++ pbufv = NULL; /* Compiler thinks an uninitialised path */ ++ if (out_num > 2 && ++ out_sg[0].iov_len == sizeof(struct fuse_in_header) && ++ ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE && ++ out_sg[1].iov_len == sizeof(struct fuse_write_in)) { ++ /* ++ * For a write we don't actually need to copy the ++ * data, we can just do it straight out of guest memory ++ * but we must still copy the headers in case the guest ++ * was nasty and changed them while we were using them.
++ */ ++ fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__); ++ ++ /* copy the fuse_write_in header after the fuse_in_header */ ++ fbuf.mem += out_sg->iov_len; ++ copy_from_iov(&fbuf, 1, out_sg + 1); ++ fbuf.mem -= out_sg->iov_len; ++ fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len; ++ ++ /* Allocate the bufv, with space for the rest of the iov */ ++ pbufv = malloc(sizeof(struct fuse_bufvec) + ++ sizeof(struct fuse_buf) * (out_num - 2)); ++ if (!pbufv) { ++ fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n", ++ __func__); ++ goto out; ++ } ++ ++ allocated_bufv = true; ++ pbufv->count = 1; ++ pbufv->buf[0] = fbuf; ++ ++ size_t iovindex, pbufvindex; ++ iovindex = 2; /* 2 headers, separate iovs */ ++ pbufvindex = 1; /* 2 headers, 1 fusebuf */ ++ ++ for (; iovindex < out_num; iovindex++, pbufvindex++) { ++ pbufv->count++; ++ pbufv->buf[pbufvindex].pos = ~0; /* Dummy */ ++ pbufv->buf[pbufvindex].flags = 0; ++ pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base; ++ pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len; ++ } ++ } else { ++ /* Normal (non fast write) path */ ++ ++ /* Copy the rest of the buffer */ ++ fbuf.mem += out_sg->iov_len; ++ copy_from_iov(&fbuf, out_num - 1, out_sg + 1); ++ fbuf.mem -= out_sg->iov_len; ++ fbuf.size = out_len; ++ ++ /* TODO!
Endianness of header */ ++ ++ /* TODO: Add checks for fuse_session_exited */ ++ bufv.buf[0] = fbuf; ++ bufv.count = 1; ++ pbufv = &bufv; ++ } ++ pbufv->idx = 0; ++ pbufv->off = 0; ++ fuse_session_process_buf_int(se, pbufv, &req->ch); ++ ++out: ++ if (allocated_bufv) { ++ free(pbufv); ++ } ++ ++ /* If the request has no reply, still recycle the virtqueue element */ ++ if (!req->reply_sent) { ++ struct VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ ++ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__, ++ elem->index); ++ ++ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ pthread_mutex_lock(&qi->vq_lock); ++ vu_queue_push(dev, q, elem, 0); ++ vu_queue_notify(dev, q); ++ pthread_mutex_unlock(&qi->vq_lock); ++ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ } ++ ++ pthread_mutex_destroy(&req->ch.lock); ++ free(fbuf.mem); ++ free(req); ++} ++ + /* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { + struct fv_QueueInfo *qi = opaque; + struct VuDev *dev = &qi->virtio_dev->dev; + struct VuVirtq *q = vu_get_queue(dev, qi->qidx); +- struct fuse_session *se = qi->virtio_dev->se; +- struct fuse_chan ch; +- struct fuse_buf fbuf; ++ GThreadPool *pool; + +- fbuf.mem = NULL; +- fbuf.flags = 0; +- +- fuse_mutex_init(&ch.lock); +- ch.fd = (int)0xdaff0d111; +- ch.qi = qi; ++ pool = g_thread_pool_new(fv_queue_worker, qi, 1 /* TODO max_threads */, ++ TRUE, NULL); ++ if (!pool) { ++ fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__); ++ return NULL; ++ } + + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); +@@ -478,6 +636,7 @@ static void *fv_queue_thread(void *opaque) + /* Mutual exclusion with virtio_loop() */ + ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + assert(ret == 0); /* there is no possible error case */ ++ pthread_mutex_lock(&qi->vq_lock); + /* out is from guest, in is too 
guest */ + unsigned int in_bytes, out_bytes; + vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0); +@@ -486,141 +645,22 @@ static void *fv_queue_thread(void *opaque) + "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n", + __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); + +- + while (1) { +- bool allocated_bufv = false; +- struct fuse_bufvec bufv; +- struct fuse_bufvec *pbufv; +- +- /* +- * An element contains one request and the space to send our +- * response They're spread over multiple descriptors in a +- * scatter/gather set and we can't trust the guest to keep them +- * still; so copy in/out. +- */ +- VuVirtqElement *elem = vu_queue_pop(dev, q, sizeof(VuVirtqElement)); +- if (!elem) { ++ FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest)); ++ if (!req) { + break; + } + +- qi->qe = elem; +- qi->reply_sent = false; ++ req->reply_sent = false; + +- if (!fbuf.mem) { +- fbuf.mem = malloc(se->bufsize); +- assert(fbuf.mem); +- assert(se->bufsize > sizeof(struct fuse_in_header)); +- } +- /* The 'out' part of the elem is from qemu */ +- unsigned int out_num = elem->out_num; +- struct iovec *out_sg = elem->out_sg; +- size_t out_len = iov_size(out_sg, out_num); +- fuse_log(FUSE_LOG_DEBUG, +- "%s: elem %d: with %d out desc of length %zd\n", __func__, +- elem->index, out_num, out_len); +- +- /* +- * The elem should contain a 'fuse_in_header' (in to fuse) +- * plus the data based on the len in the header. 
+- */ +- if (out_len < sizeof(struct fuse_in_header)) { +- fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n", +- __func__, elem->index); +- assert(0); /* TODO */ +- } +- if (out_len > se->bufsize) { +- fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", +- __func__, elem->index); +- assert(0); /* TODO */ +- } +- /* Copy just the first element and look at it */ +- copy_from_iov(&fbuf, 1, out_sg); +- +- if (out_num > 2 && +- out_sg[0].iov_len == sizeof(struct fuse_in_header) && +- ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE && +- out_sg[1].iov_len == sizeof(struct fuse_write_in)) { +- /* +- * For a write we don't actually need to copy the +- * data, we can just do it straight out of guest memory +- * but we must still copy the headers in case the guest +- * was nasty and changed them while we were using them. +- */ +- fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__); +- +- /* copy the fuse_write_in header after the fuse_in_header */ +- fbuf.mem += out_sg->iov_len; +- copy_from_iov(&fbuf, 1, out_sg + 1); +- fbuf.mem -= out_sg->iov_len; +- fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len; +- +- /* Allocate the bufv, with space for the rest of the iov */ +- allocated_bufv = true; +- pbufv = malloc(sizeof(struct fuse_bufvec) + +- sizeof(struct fuse_buf) * (out_num - 2)); +- if (!pbufv) { +- vu_queue_unpop(dev, q, elem, 0); +- free(elem); +- fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n", +- __func__); +- goto out; +- } +- +- pbufv->count = 1; +- pbufv->buf[0] = fbuf; +- +- size_t iovindex, pbufvindex; +- iovindex = 2; /* 2 headers, separate iovs */ +- pbufvindex = 1; /* 2 headers, 1 fusebuf */ +- +- for (; iovindex < out_num; iovindex++, pbufvindex++) { +- pbufv->count++; +- pbufv->buf[pbufvindex].pos = ~0; /* Dummy */ +- pbufv->buf[pbufvindex].flags = 0; +- pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base; +- pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len; +- } +- } else { +- /* Normal (non fast write) 
path */ +- +- /* Copy the rest of the buffer */ +- fbuf.mem += out_sg->iov_len; +- copy_from_iov(&fbuf, out_num - 1, out_sg + 1); +- fbuf.mem -= out_sg->iov_len; +- fbuf.size = out_len; +- +- /* TODO! Endianness of header */ +- +- /* TODO: Add checks for fuse_session_exited */ +- bufv.buf[0] = fbuf; +- bufv.count = 1; +- pbufv = &bufv; +- } +- pbufv->idx = 0; +- pbufv->off = 0; +- fuse_session_process_buf_int(se, pbufv, &ch); +- +- if (allocated_bufv) { +- free(pbufv); +- } +- +- if (!qi->reply_sent) { +- fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", +- __func__, elem->index); +- /* I think we've still got to recycle the element */ +- vu_queue_push(dev, q, elem, 0); +- vu_queue_notify(dev, q); +- } +- qi->qe = NULL; +- free(elem); +- elem = NULL; ++ g_thread_pool_push(pool, req, NULL); + } + ++ pthread_mutex_unlock(&qi->vq_lock); + pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + } +-out: +- pthread_mutex_destroy(&ch.lock); +- free(fbuf.mem); ++ ++ g_thread_pool_free(pool, FALSE, TRUE); + + return NULL; + } +@@ -643,6 +683,7 @@ static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx) + fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n", + __func__, qidx, ret); + } ++ pthread_mutex_destroy(&ourqi->vq_lock); + close(ourqi->kill_fd); + ourqi->kick_fd = -1; + free(vud->qi[qidx]); +@@ -696,6 +737,8 @@ static void fv_queue_set_started(VuDev *dev, int qidx, bool started) + + ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE); + assert(ourqi->kill_fd != -1); ++ pthread_mutex_init(&ourqi->vq_lock, NULL); ++ + if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) { + fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n", + __func__, qidx); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-remove-mountpoint-dummy-argument.patch b/kvm-virtiofsd-remove-mountpoint-dummy-argument.patch new file mode 100755 index 0000000..181e32d --- /dev/null +++ b/kvm-virtiofsd-remove-mountpoint-dummy-argument.patch @@ 
-0,0 +1,159 @@ +From a8a1835a82510be7d2d6edcc28a60e506a2cedad Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:46 +0100 +Subject: [PATCH 015/116] virtiofsd: remove mountpoint dummy argument +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-12-dgilbert@redhat.com> +Patchwork-id: 93466 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 011/112] virtiofsd: remove mountpoint dummy argument +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Classic FUSE file system daemons take a mountpoint argument but +virtiofsd exposes a vhost-user UNIX domain socket instead. The +mountpoint argument is not used by virtiofsd but the user is still +required to pass a dummy argument on the command-line. + +Remove the mountpoint argument to clean up the command-line. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 67aab02272f6cb47c56420f60b370c184961b5ca) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 2 +- + tools/virtiofsd/fuse_lowlevel.h | 4 +--- + tools/virtiofsd/helper.c | 20 +++----------------- + tools/virtiofsd/passthrough_ll.c | 12 ++---------- + 4 files changed, 7 insertions(+), 31 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 5c9cb52..2f32c68 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2455,7 +2455,7 @@ out1: + return NULL; + } + +-int fuse_session_mount(struct fuse_session *se, const char *mountpoint) ++int fuse_session_mount(struct fuse_session *se) + { + int fd; + +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index adb9054..8d8909b 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1863,7 +1863,6 @@ struct fuse_cmdline_opts { + int foreground; + int debug; + int nodefault_subtype; +- char *mountpoint; + int show_version; + int show_help; + unsigned int max_idle_threads; +@@ -1924,12 +1923,11 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + /** + * Mount a FUSE file system. + * +- * @param mountpoint the mount point path + * @param se session object + * + * @return 0 on success, -1 on failure. + **/ +-int fuse_session_mount(struct fuse_session *se, const char *mountpoint); ++int fuse_session_mount(struct fuse_session *se); + + /** + * Enter a single threaded, blocking event loop. 
+diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5711dd2..5e6f205 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -140,27 +140,13 @@ void fuse_cmdline_help(void) + static int fuse_helper_opt_proc(void *data, const char *arg, int key, + struct fuse_args *outargs) + { ++ (void)data; + (void)outargs; +- struct fuse_cmdline_opts *opts = data; + + switch (key) { + case FUSE_OPT_KEY_NONOPT: +- if (!opts->mountpoint) { +- if (fuse_mnt_parse_fuse_fd(arg) != -1) { +- return fuse_opt_add_opt(&opts->mountpoint, arg); +- } +- +- char mountpoint[PATH_MAX] = ""; +- if (realpath(arg, mountpoint) == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: bad mount point `%s': %s\n", arg, +- strerror(errno)); +- return -1; +- } +- return fuse_opt_add_opt(&opts->mountpoint, mountpoint); +- } else { +- fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); +- return -1; +- } ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); ++ return -1; + + default: + /* Pass through unknown options */ +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index c5850ef..9377718 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1297,7 +1297,7 @@ int main(int argc, char *argv[]) + return 1; + } + if (opts.show_help) { +- printf("usage: %s [options] \n\n", argv[0]); ++ printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); + fuse_lowlevel_help(); + ret = 0; +@@ -1308,13 +1308,6 @@ int main(int argc, char *argv[]) + goto err_out1; + } + +- if (opts.mountpoint == NULL) { +- printf("usage: %s [options] \n", argv[0]); +- printf(" %s --help\n", argv[0]); +- ret = 1; +- goto err_out1; +- } +- + if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { + return 1; + } +@@ -1374,7 +1367,7 @@ int main(int argc, char *argv[]) + goto err_out2; + } + +- if (fuse_session_mount(se, opts.mountpoint) != 0) { ++ if (fuse_session_mount(se) != 0) { + goto err_out3; + } + +@@ -1393,7 
+1386,6 @@ err_out3: + err_out2: + fuse_session_destroy(se); + err_out1: +- free(opts.mountpoint); + fuse_opt_free_args(&args); + + if (lo.root.fd >= 0) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-remove-unused-notify-reply-support.patch b/kvm-virtiofsd-remove-unused-notify-reply-support.patch new file mode 100755 index 0000000..98fb968 --- /dev/null +++ b/kvm-virtiofsd-remove-unused-notify-reply-support.patch @@ -0,0 +1,294 @@ +From e5534c0d4b866f61dbafa8d2422a24ab956189c1 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:47 +0100 +Subject: [PATCH 016/116] virtiofsd: remove unused notify reply support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-13-dgilbert@redhat.com> +Patchwork-id: 93467 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 012/112] virtiofsd: remove unused notify reply support +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Notify reply support is unused by virtiofsd. The code would need to be +updated to validate input buffer sizes. Remove this unused code since +changes to it are untestable. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 64c6f408a29ef03e9b8da9f5a5d8fd511b0d801e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 147 +--------------------------------------- + tools/virtiofsd/fuse_lowlevel.h | 47 ------------- + 2 files changed, 1 insertion(+), 193 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 2f32c68..eb0ec49 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -31,12 +31,6 @@ + #define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) + #define OFFSET_MAX 0x7fffffffffffffffLL + +-#define container_of(ptr, type, member) \ +- ({ \ +- const typeof(((type *)0)->member) *__mptr = (ptr); \ +- (type *)((char *)__mptr - offsetof(type, member)); \ +- }) +- + struct fuse_pollhandle { + uint64_t kh; + struct fuse_session *se; +@@ -1862,52 +1856,6 @@ static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + send_reply_ok(req, NULL, 0); + } + +-static void list_del_nreq(struct fuse_notify_req *nreq) +-{ +- struct fuse_notify_req *prev = nreq->prev; +- struct fuse_notify_req *next = nreq->next; +- prev->next = next; +- next->prev = prev; +-} +- +-static void list_add_nreq(struct fuse_notify_req *nreq, +- struct fuse_notify_req *next) +-{ +- struct fuse_notify_req *prev = next->prev; +- nreq->next = next; +- nreq->prev = prev; +- prev->next = nreq; +- next->prev = nreq; +-} +- +-static void list_init_nreq(struct fuse_notify_req *nreq) +-{ +- nreq->next = nreq; +- nreq->prev = nreq; +-} +- +-static void do_notify_reply(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg, const struct fuse_buf *buf) +-{ +- struct fuse_session *se = req->se; +- struct fuse_notify_req *nreq; +- struct fuse_notify_req *head; +- +- pthread_mutex_lock(&se->lock); +- head = &se->notify_list; +- for (nreq = head->next; nreq != head; nreq = nreq->next) { +- if (nreq->unique == req->unique) { +- list_del_nreq(nreq); +- break; +- } +- } 
+- pthread_mutex_unlock(&se->lock); +- +- if (nreq != head) { +- nreq->reply(nreq, req, nodeid, inarg, buf); +- } +-} +- + static int send_notify_iov(struct fuse_session *se, int notify_code, + struct iovec *iov, int count) + { +@@ -2059,95 +2007,6 @@ int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + return res; + } + +-struct fuse_retrieve_req { +- struct fuse_notify_req nreq; +- void *cookie; +-}; +- +-static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, fuse_req_t req, +- fuse_ino_t ino, const void *inarg, +- const struct fuse_buf *ibuf) +-{ +- struct fuse_session *se = req->se; +- struct fuse_retrieve_req *rreq = +- container_of(nreq, struct fuse_retrieve_req, nreq); +- const struct fuse_notify_retrieve_in *arg = inarg; +- struct fuse_bufvec bufv = { +- .buf[0] = *ibuf, +- .count = 1, +- }; +- +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { +- bufv.buf[0].mem = PARAM(arg); +- } +- +- bufv.buf[0].size -= +- sizeof(struct fuse_in_header) + sizeof(struct fuse_notify_retrieve_in); +- +- if (bufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, "fuse: retrieve reply: buffer size too small\n"); +- fuse_reply_none(req); +- goto out; +- } +- bufv.buf[0].size = arg->size; +- +- if (se->op.retrieve_reply) { +- se->op.retrieve_reply(req, rreq->cookie, ino, arg->offset, &bufv); +- } else { +- fuse_reply_none(req); +- } +-out: +- free(rreq); +-} +- +-int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +- size_t size, off_t offset, void *cookie) +-{ +- struct fuse_notify_retrieve_out outarg; +- struct iovec iov[2]; +- struct fuse_retrieve_req *rreq; +- int err; +- +- if (!se) { +- return -EINVAL; +- } +- +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) { +- return -ENOSYS; +- } +- +- rreq = malloc(sizeof(*rreq)); +- if (rreq == NULL) { +- return -ENOMEM; +- } +- +- pthread_mutex_lock(&se->lock); +- rreq->cookie = cookie; +- rreq->nreq.unique = se->notify_ctr++; +- rreq->nreq.reply = 
fuse_ll_retrieve_reply; +- list_add_nreq(&rreq->nreq, &se->notify_list); +- pthread_mutex_unlock(&se->lock); +- +- outarg.notify_unique = rreq->nreq.unique; +- outarg.nodeid = ino; +- outarg.offset = offset; +- outarg.size = size; +- outarg.padding = 0; +- +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); +- +- err = send_notify_iov(se, FUSE_NOTIFY_RETRIEVE, iov, 2); +- if (err) { +- pthread_mutex_lock(&se->lock); +- list_del_nreq(&rreq->nreq); +- pthread_mutex_unlock(&se->lock); +- free(rreq); +- } +- +- return err; +-} +- + void *fuse_req_userdata(fuse_req_t req) + { + return req->se->userdata; +@@ -2226,7 +2085,7 @@ static struct { + [FUSE_POLL] = { do_poll, "POLL" }, + [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, + [FUSE_DESTROY] = { do_destroy, "DESTROY" }, +- [FUSE_NOTIFY_REPLY] = { (void *)1, "NOTIFY_REPLY" }, ++ [FUSE_NOTIFY_REPLY] = { NULL, "NOTIFY_REPLY" }, + [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, + [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS" }, + [FUSE_RENAME2] = { do_rename2, "RENAME2" }, +@@ -2333,8 +2192,6 @@ void fuse_session_process_buf_int(struct fuse_session *se, + inarg = (void *)&in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) { + do_write_buf(req, in->nodeid, inarg, buf); +- } else if (in->opcode == FUSE_NOTIFY_REPLY) { +- do_notify_reply(req, in->nodeid, inarg, buf); + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); + } +@@ -2437,8 +2294,6 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + + list_init_req(&se->list); + list_init_req(&se->interrupts); +- list_init_nreq(&se->notify_list); +- se->notify_ctr = 1; + fuse_mutex_init(&se->lock); + + memcpy(&se->op, op, op_size); +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 8d8909b..12a84b4 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1085,21 +1085,6 @@ struct fuse_lowlevel_ops { + off_t off, struct fuse_file_info *fi); + + 
/** +- * Callback function for the retrieve request +- * +- * Valid replies: +- * fuse_reply_none +- * +- * @param req request handle +- * @param cookie user data supplied to fuse_lowlevel_notify_retrieve() +- * @param ino the inode number supplied to fuse_lowlevel_notify_retrieve() +- * @param offset the offset supplied to fuse_lowlevel_notify_retrieve() +- * @param bufv the buffer containing the returned data +- */ +- void (*retrieve_reply)(fuse_req_t req, void *cookie, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv); +- +- /** + * Forget about multiple inodes + * + * See description of the forget function for more +@@ -1726,38 +1711,6 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + off_t offset, struct fuse_bufvec *bufv, + enum fuse_buf_copy_flags flags); +-/** +- * Retrieve data from the kernel buffers +- * +- * Retrieve data in the kernel buffers belonging to the given inode. +- * If successful then the retrieve_reply() method will be called with +- * the returned data. +- * +- * Only present pages are returned in the retrieve reply. Retrieving +- * stops when it finds a non-present page and only data prior to that +- * is returned. +- * +- * If this function returns an error, then the retrieve will not be +- * completed and no reply will be sent. +- * +- * This function doesn't change the dirty state of pages in the kernel +- * buffer. For dirty pages the write() method will be called +- * regardless of having been retrieved previously. +- * +- * Added in FUSE protocol version 7.15. If the kernel does not support +- * this (or a newer) version, the function will return -ENOSYS and do +- * nothing. 
+- * +- * @param se the session object +- * @param ino the inode number +- * @param size the number of bytes to retrieve +- * @param offset the starting offset into the file to retrieve from +- * @param cookie user data to supply to the reply callback +- * @return zero for success, -errno for failure +- */ +-int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +- size_t size, off_t offset, void *cookie); +- + + /* + * Utility functions +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-rename-inode-refcount-to-inode-nlookup.patch b/kvm-virtiofsd-rename-inode-refcount-to-inode-nlookup.patch new file mode 100755 index 0000000..97a0db3 --- /dev/null +++ b/kvm-virtiofsd-rename-inode-refcount-to-inode-nlookup.patch @@ -0,0 +1,139 @@ +From e01a6e68d799ed2af0ca3b04d75818ba62b18682 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:08 +0100 +Subject: [PATCH 097/116] virtiofsd: rename inode->refcount to inode->nlookup +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-94-dgilbert@redhat.com> +Patchwork-id: 93547 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 093/112] virtiofsd: rename inode->refcount to inode->nlookup +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +This reference counter plays a specific role in the FUSE protocol. It's +not a generic object reference counter and the FUSE kernel code calls it +"nlookup". + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 1222f015558fc34cea02aa3a5a92de608c82cec8) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 37 +++++++++++++++++++++++++------------ + 1 file changed, 25 insertions(+), 12 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 2d703b5..c819b5f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -99,7 +99,20 @@ struct lo_inode { + int fd; + bool is_symlink; + struct lo_key key; +- uint64_t refcount; /* protected by lo->mutex */ ++ ++ /* ++ * This counter keeps the inode alive during the FUSE session. ++ * Incremented when the FUSE inode number is sent in a reply ++ * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is ++ * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc. ++ * ++ * Note that this value is untrusted because the client can manipulate ++ * it arbitrarily using FUSE_FORGET requests. ++ * ++ * Protected by lo->mutex. 
++ */ ++ uint64_t nlookup; ++ + fuse_ino_t fuse_ino; + pthread_mutex_t plock_mutex; + GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ +@@ -568,7 +581,7 @@ retry: + if (last == path) { + p = &lo->root; + pthread_mutex_lock(&lo->mutex); +- p->refcount++; ++ p->nlookup++; + pthread_mutex_unlock(&lo->mutex); + } else { + *last = '\0'; +@@ -786,8 +799,8 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + pthread_mutex_lock(&lo->mutex); + p = g_hash_table_lookup(lo->inodes, &key); + if (p) { +- assert(p->refcount > 0); +- p->refcount++; ++ assert(p->nlookup > 0); ++ p->nlookup++; + } + pthread_mutex_unlock(&lo->mutex); + +@@ -855,7 +868,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + inode->is_symlink = S_ISLNK(e->attr.st_mode); +- inode->refcount = 1; ++ inode->nlookup = 1; + inode->fd = newfd; + newfd = -1; + inode->key.ino = e->attr.st_ino; +@@ -1112,7 +1125,7 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + } + + pthread_mutex_lock(&lo->mutex); +- inode->refcount++; ++ inode->nlookup++; + pthread_mutex_unlock(&lo->mutex); + e.ino = inode->fuse_ino; + +@@ -1193,9 +1206,9 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + } + + pthread_mutex_lock(&lo->mutex); +- assert(inode->refcount >= n); +- inode->refcount -= n; +- if (!inode->refcount) { ++ assert(inode->nlookup >= n); ++ inode->nlookup -= n; ++ if (!inode->nlookup) { + lo_map_remove(&lo->ino_map, inode->fuse_ino); + g_hash_table_remove(lo->inodes, &inode->key); + if (g_hash_table_size(inode->posix_locks)) { +@@ -1216,7 +1229,7 @@ static int unref_all_inodes_cb(gpointer key, gpointer value, gpointer user_data) + struct lo_inode *inode = value; + struct lo_data *lo = user_data; + +- inode->refcount = 0; ++ inode->nlookup = 0; + lo_map_remove(&lo->ino_map, inode->fuse_ino); + close(inode->fd); + +@@ -1241,7 +1254,7 @@ static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, 
uint64_t nlookup) + } + + fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", +- (unsigned long long)ino, (unsigned long long)inode->refcount, ++ (unsigned long long)ino, (unsigned long long)inode->nlookup, + (unsigned long long)nlookup); + + unref_inode_lolocked(lo, inode, nlookup); +@@ -2609,7 +2622,7 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) + root->fd = fd; + root->key.ino = stat.st_ino; + root->key.dev = stat.st_dev; +- root->refcount = 2; ++ root->nlookup = 2; + } + + static guint lo_key_hash(gconstpointer key) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-rename-unref_inode-to-unref_inode_lolocked.patch b/kvm-virtiofsd-rename-unref_inode-to-unref_inode_lolocked.patch new file mode 100755 index 0000000..95858f8 --- /dev/null +++ b/kvm-virtiofsd-rename-unref_inode-to-unref_inode_lolocked.patch @@ -0,0 +1,94 @@ +From cfa4550f926e7a07757853f94273f2d1589cb9d3 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:48 +0100 +Subject: [PATCH 077/116] virtiofsd: rename unref_inode() to + unref_inode_lolocked() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-74-dgilbert@redhat.com> +Patchwork-id: 93526 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 073/112] virtiofsd: rename unref_inode() to unref_inode_lolocked() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Signed-off-by: Miklos Szeredi +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 95d2715791c60b5dc2d22e4eb7b83217273296fa) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 8b1784f..de12e75 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -148,8 +148,8 @@ static const struct fuse_opt lo_opts[] = { + }; + static bool use_syslog = false; + static int current_log_level; +- +-static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); ++static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, ++ uint64_t n); + + static struct { + pthread_mutex_t mutex; +@@ -586,7 +586,7 @@ retry: + return 0; + + fail_unref: +- unref_inode(lo, p, 1); ++ unref_inode_lolocked(lo, p, 1); + fail: + if (retries) { + retries--; +@@ -624,7 +624,7 @@ fallback: + res = lo_parent_and_name(lo, inode, path, &parent); + if (res != -1) { + res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW); +- unref_inode(lo, parent, 1); ++ unref_inode_lolocked(lo, parent, 1); + } + + return res; +@@ -1027,7 +1027,7 @@ fallback: + res = lo_parent_and_name(lo, inode, path, &parent); + if (res != -1) { + res = linkat(parent->fd, path, dfd, name, 0); +- unref_inode(lo, parent, 1); ++ unref_inode_lolocked(lo, parent, 1); + } + + return res; +@@ -1141,7 +1141,8 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + fuse_reply_err(req, res == -1 ? 
errno : 0); + } + +-static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) ++static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, ++ uint64_t n) + { + if (!inode) { + return; +@@ -1181,7 +1182,7 @@ static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + (unsigned long long)ino, (unsigned long long)inode->refcount, + (unsigned long long)nlookup); + +- unref_inode(lo, inode, nlookup); ++ unref_inode_lolocked(lo, inode, nlookup); + } + + static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-sandbox-mount-namespace.patch b/kvm-virtiofsd-sandbox-mount-namespace.patch new file mode 100755 index 0000000..ab6f751 --- /dev/null +++ b/kvm-virtiofsd-sandbox-mount-namespace.patch @@ -0,0 +1,166 @@ +From c7ae38df696e4be432fd418c670dcea892b910a7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:27 +0100 +Subject: [PATCH 056/116] virtiofsd: sandbox mount namespace +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-53-dgilbert@redhat.com> +Patchwork-id: 93504 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 052/112] virtiofsd: sandbox mount namespace +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Use a mount namespace with the shared directory tree mounted at "/" and +no other mounts. + +This prevents symlink escape attacks because symlink targets are +resolved only against the shared directory and cannot go outside it. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Peng Tao +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 5baa3b8e95064c2434bd9e2f312edd5e9ae275dc) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 89 ++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 89 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e2e2211..0570453 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1943,6 +1944,58 @@ static void print_capabilities(void) + printf("}\n"); + } + ++/* This magic is based on lxc's lxc_pivot_root() */ ++static void setup_pivot_root(const char *source) ++{ ++ int oldroot; ++ int newroot; ++ ++ oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); ++ if (oldroot < 0) { ++ fuse_log(FUSE_LOG_ERR, "open(/): %m\n"); ++ exit(1); ++ } ++ ++ newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC); ++ if (newroot < 0) { ++ fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source); ++ exit(1); ++ } ++ ++ if (fchdir(newroot) < 0) { ++ fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); ++ exit(1); ++ } ++ ++ if (syscall(__NR_pivot_root, ".", ".") < 0) { ++ fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n"); ++ exit(1); ++ } ++ ++ if (fchdir(oldroot) < 0) { ++ fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n"); ++ exit(1); ++ } ++ ++ if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n"); ++ exit(1); ++ } ++ ++ if (umount2(".", MNT_DETACH) < 0) { ++ fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n"); ++ exit(1); ++ } ++ ++ if (fchdir(newroot) < 0) { ++ fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); ++ exit(1); ++ } ++ ++ close(newroot); ++ close(oldroot); ++} ++ + static void setup_proc_self_fd(struct lo_data *lo) + { + lo->proc_self_fd = open("/proc/self/fd", O_PATH); +@@ -1952,6 +2005,39 @@ static void setup_proc_self_fd(struct lo_data *lo) + } + } + ++/* 
++ * Make the source directory our root so symlinks cannot escape and no other ++ * files are accessible. ++ */ ++static void setup_mount_namespace(const char *source) ++{ ++ if (unshare(CLONE_NEWNS) != 0) { ++ fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNS): %m\n"); ++ exit(1); ++ } ++ ++ if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_PRIVATE): %m\n"); ++ exit(1); ++ } ++ ++ if (mount(source, source, NULL, MS_BIND, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); ++ exit(1); ++ } ++ ++ setup_pivot_root(source); ++} ++ ++/* ++ * Lock down this process to prevent access to other processes or files outside ++ * source directory. This reduces the impact of arbitrary code execution bugs. ++ */ ++static void setup_sandbox(struct lo_data *lo) ++{ ++ setup_mount_namespace(lo->source); ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2052,6 +2138,7 @@ int main(int argc, char *argv[]) + } + + lo.root.fd = open(lo.source, O_PATH); ++ + if (lo.root.fd == -1) { + fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): %m\n", lo.source); + exit(1); +@@ -2075,6 +2162,8 @@ int main(int argc, char *argv[]) + /* Must be after daemonize to get the right /proc/self/fd */ + setup_proc_self_fd(&lo); + ++ setup_sandbox(&lo); ++ + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-set-maximum-RLIMIT_NOFILE-limit.patch b/kvm-virtiofsd-set-maximum-RLIMIT_NOFILE-limit.patch new file mode 100755 index 0000000..e54248c --- /dev/null +++ b/kvm-virtiofsd-set-maximum-RLIMIT_NOFILE-limit.patch @@ -0,0 +1,93 @@ +From 4cc435b3a8a9a419cc85ee883d5184f810f91e52 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:34 +0100 +Subject: [PATCH 063/116] virtiofsd: set maximum RLIMIT_NOFILE limit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-60-dgilbert@redhat.com> +Patchwork-id: 93516 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 059/112] virtiofsd: set maximum RLIMIT_NOFILE limit +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +virtiofsd can exceed the default open file descriptor limit easily on +most systems. Take advantage of the fact that it runs as root to raise +the limit. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 01a6dc95ec7f71eeff9963fe3cb03d85225fba3e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 32 ++++++++++++++++++++++++++++++++ + 1 file changed, 32 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index d53cb1e..c281d81 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -53,6 +53,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -2268,6 +2269,35 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se) + setup_seccomp(); + } + ++/* Raise the maximum number of open file descriptors */ ++static void setup_nofile_rlimit(void) ++{ ++ const rlim_t max_fds = 1000000; ++ struct rlimit rlim; ++ ++ if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { ++ fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); ++ exit(1); ++ } ++ ++ if (rlim.rlim_cur >= max_fds) { ++ return; /* nothing to do */ ++ } ++ ++ rlim.rlim_cur = max_fds; ++ rlim.rlim_max = max_fds; ++ ++ if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { ++ /* Ignore SELinux 
denials */ ++ if (errno == EPERM) { ++ return; ++ } ++ ++ fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n"); ++ exit(1); ++ } ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2389,6 +2419,8 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + ++ setup_nofile_rlimit(); ++ + /* Must be before sandbox since it wants /proc */ + setup_capng(); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-stay-below-fs.file-max-sysctl-value-CVE-20.patch b/kvm-virtiofsd-stay-below-fs.file-max-sysctl-value-CVE-20.patch new file mode 100755 index 0000000..ce74f4d --- /dev/null +++ b/kvm-virtiofsd-stay-below-fs.file-max-sysctl-value-CVE-20.patch @@ -0,0 +1,88 @@ +From 301f19f2ebd617e43e3a8e7bdcf694de580fe689 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:56 +0100 +Subject: [PATCH 5/9] virtiofsd: stay below fs.file-max sysctl value + (CVE-2020-10717) + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-4-dgilbert@redhat.com> +Patchwork-id: 96271 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 3/7] virtiofsd: stay below fs.file-max sysctl value (CVE-2020-10717) +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. Tsirkin + +From: Stefan Hajnoczi + +The system-wide fs.file-max sysctl value determines how many files can +be open. It defaults to a value calculated based on the machine's RAM +size. Previously virtiofsd would try to set RLIMIT_NOFILE to 1,000,000 +and this allowed the FUSE client to exhaust the number of open files +system-wide on Linux hosts with less than 10 GB of RAM! + +Take fs.file-max into account when choosing the default RLIMIT_NOFILE +value. + +Fixes: CVE-2020-10717 +Reported-by: Yuval Avrahami +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Dr. David Alan Gilbert +Message-Id: <20200501140644.220940-3-stefanha@redhat.com> +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 8c1d353d107b4fc344e27f2f08ea7fa25de2eea2) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/helper.c | 26 +++++++++++++++++++++++++- + 1 file changed, 25 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 9b3eddc..5b222ea 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -176,7 +176,8 @@ void fuse_cmdline_help(void) + " default: no_xattr\n" + " --rlimit-nofile= set maximum number of file descriptors\n" + " (0 leaves rlimit unchanged)\n" +- " default: 1,000,000 if the current rlimit is lower\n" ++ " default: min(1000000, fs.file-max - 16384)\n" ++ " if the current rlimit is lower\n" + ); + } + +@@ -199,9 +200,32 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key, + + static unsigned long get_default_rlimit_nofile(void) + { ++ g_autofree gchar *file_max_str = NULL; ++ const rlim_t reserved_fds = 16384; /* leave at least this many fds free */ + rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */ ++ rlim_t file_max; + struct rlimit rlim; + ++ /* ++ * Reduce max_fds below the system-wide maximum, if necessary. This ++ * ensures there are fds available for other processes so we don't ++ * cause resource exhaustion. 
++ */ ++ if (!g_file_get_contents("/proc/sys/fs/file-max", &file_max_str, ++ NULL, NULL)) { ++ fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n"); ++ exit(1); ++ } ++ file_max = g_ascii_strtoull(file_max_str, NULL, 10); ++ if (file_max < 2 * reserved_fds) { ++ fuse_log(FUSE_LOG_ERR, ++ "The fs.file-max sysctl is too low (%lu) to allow a " ++ "reasonable number of open files.\n", ++ (unsigned long)file_max); ++ exit(1); ++ } ++ max_fds = MIN(file_max - reserved_fds, max_fds); ++ + if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { + fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); + exit(1); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-stop-all-queue-threads-on-exit-in-virtio_l.patch b/kvm-virtiofsd-stop-all-queue-threads-on-exit-in-virtio_l.patch new file mode 100755 index 0000000..be6b244 --- /dev/null +++ b/kvm-virtiofsd-stop-all-queue-threads-on-exit-in-virtio_l.patch @@ -0,0 +1,72 @@ +From 06a24b54c94345b436d888a48b92fafa967c3d58 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:25 +0100 +Subject: [PATCH 114/116] virtiofsd: stop all queue threads on exit in + virtio_loop() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-111-dgilbert@redhat.com> +Patchwork-id: 93564 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 110/112] virtiofsd: stop all queue threads on exit in virtio_loop() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Eryu Guan + +On guest graceful shutdown, virtiofsd receives VHOST_USER_GET_VRING_BASE +request from VMM and shuts down virtqueues by calling fv_set_started(), +which joins fv_queue_thread() threads. So when virtio_loop() returns, +there should be no thread is still accessing data in fuse session and/or +virtio dev. + +But on abnormal exit, e.g. 
guest got killed for whatever reason, +vhost-user socket is closed and virtio_loop() breaks out the main loop +and returns to main(). But it's possible fv_queue_worker()s are still +working and accessing fuse session and virtio dev, which results in +crash or use-after-free. + +Fix it by stopping fv_queue_thread()s before virtio_loop() returns, +to make sure there's no-one could access fuse session and virtio dev. + +Reported-by: Qingming Su +Signed-off-by: Eryu Guan +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9883df8ccae6d744a0c8d9cbf9d62b1797d70ebd) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 9f65823..80a6e92 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -815,6 +815,19 @@ int virtio_loop(struct fuse_session *se) + } + } + ++ /* ++ * Make sure all fv_queue_thread()s quit on exit, as we're about to ++ * free virtio dev and fuse session, no one should access them anymore. ++ */ ++ for (int i = 0; i < se->virtio_dev->nqueues; i++) { ++ if (!se->virtio_dev->qi[i]) { ++ continue; ++ } ++ ++ fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i); ++ fv_queue_cleanup_thread(se->virtio_dev, i); ++ } ++ + fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__); + + return 0; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-support-nanosecond-resolution-for-file-tim.patch b/kvm-virtiofsd-support-nanosecond-resolution-for-file-tim.patch new file mode 100755 index 0000000..f595ffa --- /dev/null +++ b/kvm-virtiofsd-support-nanosecond-resolution-for-file-tim.patch @@ -0,0 +1,83 @@ +From 1744329bcba4a3e1a82cec3b1a34b3fbf0a9d7cf Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:54 +0100 +Subject: [PATCH 083/116] virtiofsd: support nanosecond resolution for file + timestamp +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-80-dgilbert@redhat.com> +Patchwork-id: 93535 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 079/112] virtiofsd: support nanosecond resolution for file timestamp +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Jiufei Xue + +Define HAVE_STRUCT_STAT_ST_ATIM to 1 if `st_atim' is member of `struct +stat' which means support nanosecond resolution for the file timestamp +fields. + +Signed-off-by: Jiufei Xue +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 8a792b034d4b315251fd842bb4c73a133aa1368f) +Signed-off-by: Miroslav Rezanina +--- + configure | 16 ++++++++++++++++ + tools/virtiofsd/fuse_misc.h | 1 + + 2 files changed, 17 insertions(+) + +diff --git a/configure b/configure +index 7831618..5120c14 100755 +--- a/configure ++++ b/configure +@@ -5218,6 +5218,19 @@ if compile_prog "" "" ; then + strchrnul=yes + fi + ++######################################### ++# check if we have st_atim ++ ++st_atim=no ++cat > $TMPC << EOF ++#include ++#include ++int main(void) { return offsetof(struct stat, st_atim); } ++EOF ++if compile_prog "" "" ; then ++ st_atim=yes ++fi ++ + ########################################## + # check if trace backend exists + +@@ -6919,6 +6932,9 @@ fi + if test "$strchrnul" = "yes" ; then + echo "HAVE_STRCHRNUL=y" >> $config_host_mak + fi ++if test "$st_atim" = "yes" ; then ++ echo "HAVE_STRUCT_STAT_ST_ATIM=y" >> $config_host_mak ++fi + if test "$byteswap_h" = "yes" ; then + echo "CONFIG_BYTESWAP_H=y" >> $config_host_mak + fi +diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h +index f252baa..5c618ce 100644 
+--- a/tools/virtiofsd/fuse_misc.h ++++ b/tools/virtiofsd/fuse_misc.h +@@ -7,6 +7,7 @@ + */ + + #include ++#include "config-host.h" + + /* + * Versioned symbols cannot be used in some cases because it +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-use-fuse_buf_writev-to-replace-fuse_buf_wr.patch b/kvm-virtiofsd-use-fuse_buf_writev-to-replace-fuse_buf_wr.patch new file mode 100755 index 0000000..1bae1bf --- /dev/null +++ b/kvm-virtiofsd-use-fuse_buf_writev-to-replace-fuse_buf_wr.patch @@ -0,0 +1,82 @@ +From 7bc27a767bc8c78b1bca46bbe5e1d53dcd7173b4 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:18 +0100 +Subject: [PATCH 107/116] virtiofsd: use fuse_buf_writev to replace + fuse_buf_write for better performance +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-104-dgilbert@redhat.com> +Patchwork-id: 93558 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 103/112] virtiofsd: use fuse_buf_writev to replace fuse_buf_write for better performance +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: piaojun + +fuse_buf_writev() only handles the normal write in which src is buffer +and dest is fd. Specially if src buffer represents guest physical +address that can't be mapped by the daemon process, IO must be bounced +back to the VMM to do it by fuse_buf_copy(). + +Signed-off-by: Jun Piao +Suggested-by: Dr. David Alan Gilbert +Suggested-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit c465bba2c90a810f6e71e4f2646b1b4ee4b478de) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 37befeb..27c1377 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -34,7 +34,6 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv) + return size; + } + +-__attribute__((unused)) + static ssize_t fuse_buf_writev(struct fuse_buf *out_buf, + struct fuse_bufvec *in_buf) + { +@@ -262,12 +261,29 @@ static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) + + ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv) + { +- size_t copied = 0; ++ size_t copied = 0, i; + + if (dstv == srcv) { + return fuse_buf_size(dstv); + } + ++ /* ++ * use writev to improve bandwidth when all the ++ * src buffers already mapped by the daemon ++ * process ++ */ ++ for (i = 0; i < srcv->count; i++) { ++ if (srcv->buf[i].flags & FUSE_BUF_IS_FD) { ++ break; ++ } ++ } ++ if ((i == srcv->count) && (dstv->count == 1) && ++ (dstv->idx == 0) && ++ (dstv->buf[0].flags & FUSE_BUF_IS_FD)) { ++ dstv->buf[0].pos += dstv->off; ++ return fuse_buf_writev(&dstv->buf[0], srcv); ++ } ++ + for (;;) { + const struct fuse_buf *src = fuse_bufvec_current(srcv); + const struct fuse_buf *dst = fuse_bufvec_current(dstv); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-use-fuse_lowlevel_is_virtio-in-fuse_sessio.patch b/kvm-virtiofsd-use-fuse_lowlevel_is_virtio-in-fuse_sessio.patch new file mode 100755 index 0000000..feffb5e --- /dev/null +++ b/kvm-virtiofsd-use-fuse_lowlevel_is_virtio-in-fuse_sessio.patch @@ -0,0 +1,56 @@ +From 1724f54070d33d8070ba2d22c8fac87ea65814c1 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:04 +0100 +Subject: [PATCH 093/116] virtiofsd: use fuse_lowlevel_is_virtio() in + fuse_session_destroy() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-90-dgilbert@redhat.com> +Patchwork-id: 93540 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 089/112] virtiofsd: use fuse_lowlevel_is_virtio() in fuse_session_destroy() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +vu_socket_path is NULL when --fd=FDNUM was used. Use +fuse_lowlevel_is_virtio() instead. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 620e9d8d9cee6df7fe71168dea950dba0cc21a4a) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 70568d2..dab6a31 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2537,12 +2537,13 @@ void fuse_session_destroy(struct fuse_session *se) + close(se->fd); + } + +- if (se->vu_socket_path) { ++ if (fuse_lowlevel_is_virtio(se)) { + virtio_session_close(se); +- free(se->vu_socket_path); +- se->vu_socket_path = NULL; + } + ++ free(se->vu_socket_path); ++ se->vu_socket_path = NULL; ++ + free(se); + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-use-proc-self-fd-O_PATH-file-descriptor.patch b/kvm-virtiofsd-use-proc-self-fd-O_PATH-file-descriptor.patch new file mode 100755 index 0000000..f250ed7 --- /dev/null +++ b/kvm-virtiofsd-use-proc-self-fd-O_PATH-file-descriptor.patch @@ -0,0 +1,390 @@ +From bce5070d1aada88154b811a08eec1586ab24fce5 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:26 +0100 +Subject: [PATCH 055/116] virtiofsd: use /proc/self/fd/ O_PATH file descriptor +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-52-dgilbert@redhat.com> +Patchwork-id: 93506 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 051/112] virtiofsd: use /proc/self/fd/ O_PATH file descriptor +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Sandboxing will remove /proc from the mount namespace so we can no +longer build string paths into "/proc/self/fd/...". + +Keep an O_PATH file descriptor so we can still re-open fds via +/proc/self/fd. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9f59d175e2ca96f0b87f534dba69ea547dd35945) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 130 +++++++++++++++++++++++++++++++-------- + 1 file changed, 103 insertions(+), 27 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e3d65c3..e2e2211 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -110,6 +110,9 @@ struct lo_data { + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ + struct lo_map fd_map; /* protected by lo->mutex */ ++ ++ /* An O_PATH file descriptor to /proc/self/fd/ */ ++ int proc_self_fd; + }; + + static const struct fuse_opt lo_opts[] = { +@@ -379,9 +382,9 @@ static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode, + int res; + + retry: +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); + +- res = readlink(procname, path, PATH_MAX); ++ res = readlinkat(lo->proc_self_fd, procname, path, PATH_MAX); 
+ if (res < 0) { + fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__); + goto fail_noretry; +@@ -477,9 +480,9 @@ static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode, + } + return res; + } +- sprintf(path, "/proc/self/fd/%i", inode->fd); ++ sprintf(path, "%i", inode->fd); + +- return utimensat(AT_FDCWD, path, tv, 0); ++ return utimensat(lo->proc_self_fd, path, tv, 0); + + fallback: + res = lo_parent_and_name(lo, inode, path, &parent); +@@ -535,8 +538,8 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + if (fi) { + res = fchmod(fd, attr->st_mode); + } else { +- sprintf(procname, "/proc/self/fd/%i", ifd); +- res = chmod(procname, attr->st_mode); ++ sprintf(procname, "%i", ifd); ++ res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0); + } + if (res == -1) { + goto out_err; +@@ -552,11 +555,23 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + } + } + if (valid & FUSE_SET_ATTR_SIZE) { ++ int truncfd; ++ + if (fi) { +- res = ftruncate(fd, attr->st_size); ++ truncfd = fd; + } else { +- sprintf(procname, "/proc/self/fd/%i", ifd); +- res = truncate(procname, attr->st_size); ++ sprintf(procname, "%i", ifd); ++ truncfd = openat(lo->proc_self_fd, procname, O_RDWR); ++ if (truncfd < 0) { ++ goto out_err; ++ } ++ } ++ ++ res = ftruncate(truncfd, attr->st_size); ++ if (!fi) { ++ saverr = errno; ++ close(truncfd); ++ errno = saverr; + } + if (res == -1) { + goto out_err; +@@ -874,9 +889,9 @@ static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode, + return res; + } + +- sprintf(path, "/proc/self/fd/%i", inode->fd); ++ sprintf(path, "%i", inode->fd); + +- return linkat(AT_FDCWD, path, dfd, name, AT_SYMLINK_FOLLOW); ++ return linkat(lo->proc_self_fd, path, dfd, name, AT_SYMLINK_FOLLOW); + + fallback: + res = lo_parent_and_name(lo, inode, path, &parent); +@@ -1404,8 +1419,8 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + fi->flags &= 
~O_APPEND; + } + +- sprintf(buf, "/proc/self/fd/%i", lo_fd(req, ino)); +- fd = open(buf, fi->flags & ~O_NOFOLLOW); ++ sprintf(buf, "%i", lo_fd(req, ino)); ++ fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); + if (fd == -1) { + return (void)fuse_reply_err(req, errno); + } +@@ -1458,7 +1473,6 @@ static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) + { + int res; +- (void)ino; + int fd; + char *buf; + +@@ -1466,12 +1480,14 @@ static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + (void *)fi); + + if (!fi) { +- res = asprintf(&buf, "/proc/self/fd/%i", lo_fd(req, ino)); ++ struct lo_data *lo = lo_data(req); ++ ++ res = asprintf(&buf, "%i", lo_fd(req, ino)); + if (res == -1) { + return (void)fuse_reply_err(req, errno); + } + +- fd = open(buf, O_RDWR); ++ fd = openat(lo->proc_self_fd, buf, O_RDWR); + free(buf); + if (fd == -1) { + return (void)fuse_reply_err(req, errno); +@@ -1587,11 +1603,13 @@ static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + size_t size) + { ++ struct lo_data *lo = lo_data(req); + char *value = NULL; + char procname[64]; + struct lo_inode *inode; + ssize_t ret; + int saverr; ++ int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { +@@ -1616,7 +1634,11 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ goto out_err; ++ } + + if (size) { + value = malloc(size); +@@ -1624,7 +1646,7 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out_err; + } + +- ret = getxattr(procname, name, value, size); ++ ret = fgetxattr(fd, name, value, size); + if (ret == -1) { + goto out_err; + } +@@ -1635,7 +1657,7 @@ static void lo_getxattr(fuse_req_t req, 
fuse_ino_t ino, const char *name, + + fuse_reply_buf(req, value, ret); + } else { +- ret = getxattr(procname, name, NULL, 0); ++ ret = fgetxattr(fd, name, NULL, 0); + if (ret == -1) { + goto out_err; + } +@@ -1644,6 +1666,10 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + } + out_free: + free(value); ++ ++ if (fd >= 0) { ++ close(fd); ++ } + return; + + out_err: +@@ -1655,11 +1681,13 @@ out: + + static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + { ++ struct lo_data *lo = lo_data(req); + char *value = NULL; + char procname[64]; + struct lo_inode *inode; + ssize_t ret; + int saverr; ++ int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { +@@ -1683,7 +1711,11 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + goto out; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ goto out_err; ++ } + + if (size) { + value = malloc(size); +@@ -1691,7 +1723,7 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + goto out_err; + } + +- ret = listxattr(procname, value, size); ++ ret = flistxattr(fd, value, size); + if (ret == -1) { + goto out_err; + } +@@ -1702,7 +1734,7 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + + fuse_reply_buf(req, value, ret); + } else { +- ret = listxattr(procname, NULL, 0); ++ ret = flistxattr(fd, NULL, 0); + if (ret == -1) { + goto out_err; + } +@@ -1711,6 +1743,10 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + } + out_free: + free(value); ++ ++ if (fd >= 0) { ++ close(fd); ++ } + return; + + out_err: +@@ -1724,9 +1760,11 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + const char *value, size_t size, int flags) + { + char procname[64]; ++ struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + ssize_t ret; + int saverr; ++ int fd = -1; + + 
inode = lo_inode(req, ino); + if (!inode) { +@@ -1751,21 +1789,31 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, procname, O_RDWR); ++ if (fd < 0) { ++ saverr = errno; ++ goto out; ++ } + +- ret = setxattr(procname, name, value, size, flags); ++ ret = fsetxattr(fd, name, value, size, flags); + saverr = ret == -1 ? errno : 0; + + out: ++ if (fd >= 0) { ++ close(fd); ++ } + fuse_reply_err(req, saverr); + } + + static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + { + char procname[64]; ++ struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + ssize_t ret; + int saverr; ++ int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { +@@ -1789,12 +1837,20 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + goto out; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, procname, O_RDWR); ++ if (fd < 0) { ++ saverr = errno; ++ goto out; ++ } + +- ret = removexattr(procname, name); ++ ret = fremovexattr(fd, name); + saverr = ret == -1 ? 
errno : 0; + + out: ++ if (fd >= 0) { ++ close(fd); ++ } + fuse_reply_err(req, saverr); + } + +@@ -1887,12 +1943,25 @@ static void print_capabilities(void) + printf("}\n"); + } + ++static void setup_proc_self_fd(struct lo_data *lo) ++{ ++ lo->proc_self_fd = open("/proc/self/fd", O_PATH); ++ if (lo->proc_self_fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); ++ exit(1); ++ } ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); + struct fuse_session *se; + struct fuse_cmdline_opts opts; +- struct lo_data lo = { .debug = 0, .writeback = 0 }; ++ struct lo_data lo = { ++ .debug = 0, ++ .writeback = 0, ++ .proc_self_fd = -1, ++ }; + struct lo_map_elem *root_elem; + int ret = -1; + +@@ -2003,6 +2072,9 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + ++ /* Must be after daemonize to get the right /proc/self/fd */ ++ setup_proc_self_fd(&lo); ++ + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + +@@ -2018,6 +2090,10 @@ err_out1: + lo_map_destroy(&lo.dirp_map); + lo_map_destroy(&lo.ino_map); + ++ if (lo.proc_self_fd >= 0) { ++ close(lo.proc_self_fd); ++ } ++ + if (lo.root.fd >= 0) { + close(lo.root.fd); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-validate-input-buffer-sizes-in-do_write_bu.patch b/kvm-virtiofsd-validate-input-buffer-sizes-in-do_write_bu.patch new file mode 100755 index 0000000..d60a902 --- /dev/null +++ b/kvm-virtiofsd-validate-input-buffer-sizes-in-do_write_bu.patch @@ -0,0 +1,137 @@ +From 6877a6c456178d6c1ca9a0ffaabaa7e51105b2ac Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:22 +0100 +Subject: [PATCH 051/116] virtiofsd: validate input buffer sizes in + do_write_buf() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. 
David Alan Gilbert +Message-id: <20200127190227.40942-48-dgilbert@redhat.com> +Patchwork-id: 93501 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 047/112] virtiofsd: validate input buffer sizes in do_write_buf() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +There is a small change in behavior: if fuse_write_in->size doesn't +match the input buffer size then the request is failed. Previously +write requests with 1 fuse_buf element would truncate to +fuse_write_in->size. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 0ba8c3c6fce8fe949d59c1fd84d98d220ef9e759) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 49 +++++++++++++++++++++++++---------------- + 1 file changed, 30 insertions(+), 19 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 7e10995..611e8b0 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1003,8 +1003,8 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, +- struct fuse_bufvec *ibufv) ++static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter, struct fuse_bufvec *ibufv) + { + struct fuse_session *se = req->se; + struct fuse_bufvec *pbufv = ibufv; +@@ -1012,28 +1012,27 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, + .buf[0] = ibufv->buf[0], + .count = 1, + }; +- struct fuse_write_in *arg = (struct fuse_write_in *)inarg; ++ struct fuse_write_in *arg; ++ size_t arg_size = sizeof(*arg); + struct fuse_file_info fi; + + memset(&fi, 0, sizeof(fi)); ++ ++ arg = fuse_mbuf_iter_advance(iter, arg_size); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ fi.lock_owner = 
arg->lock_owner; ++ fi.flags = arg->flags; + fi.fh = arg->fh; + fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; + + if (ibufv->count == 1) { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- if (!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)) { +- tmpbufv.buf[0].mem = PARAM(arg); +- } +- tmpbufv.buf[0].size -= +- sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); +- if (tmpbufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: do_write_buf: buffer size too small\n"); +- fuse_reply_err(req, EIO); +- return; +- } +- tmpbufv.buf[0].size = arg->size; ++ assert(!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)); ++ tmpbufv.buf[0].mem = ((char *)arg) + arg_size; ++ tmpbufv.buf[0].size -= sizeof(struct fuse_in_header) + arg_size; + pbufv = &tmpbufv; + } else { + /* +@@ -1043,6 +1042,13 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, + ibufv->buf[0].size = 0; + } + ++ if (fuse_buf_size(pbufv) != arg->size) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: do_write_buf: buffer size doesn't match arg->size\n"); ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ + se->op.write_buf(req, nodeid, pbufv, arg->offset, &fi); + } + +@@ -2052,12 +2058,17 @@ void fuse_session_process_buf_int(struct fuse_session *se, + struct fuse_chan *ch) + { + const struct fuse_buf *buf = bufv->buf; ++ struct fuse_mbuf_iter iter = FUSE_MBUF_ITER_INIT(buf); + struct fuse_in_header *in; + const void *inarg; + struct fuse_req *req; + int err; + +- in = buf->mem; ++ /* The first buffer must be a memory buffer */ ++ assert(!(buf->flags & FUSE_BUF_IS_FD)); ++ ++ in = fuse_mbuf_iter_advance(&iter, sizeof(*in)); ++ assert(in); /* caller guarantees the input buffer is large enough */ + + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, +@@ -2129,7 +2140,7 @@ void fuse_session_process_buf_int(struct fuse_session *se, + + inarg = (void *)&in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) { +- do_write_buf(req, in->nodeid, inarg, bufv); ++ do_write_buf(req, 
in->nodeid, &iter, bufv); + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-validate-path-components.patch b/kvm-virtiofsd-validate-path-components.patch new file mode 100755 index 0000000..b35aed7 --- /dev/null +++ b/kvm-virtiofsd-validate-path-components.patch @@ -0,0 +1,164 @@ +From 69ac47502848c37ca3ede00f432c0675d9eef42c Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:18 +0100 +Subject: [PATCH 047/116] virtiofsd: validate path components +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-44-dgilbert@redhat.com> +Patchwork-id: 93498 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 043/112] virtiofsd: validate path components +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Several FUSE requests contain single path components. A correct FUSE +client sends well-formed path components but there is currently no input +validation in case something went wrong or the client is malicious. + +Refuse ".", "..", and paths containing '/' when we expect a path +component. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. 
David Alan Gilbert +(cherry picked from commit 25dae28c58d7e706b5d5db99042c9db3cef2e657) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 59 ++++++++++++++++++++++++++++++++++++---- + 1 file changed, 53 insertions(+), 6 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index ac380ef..e375406 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -133,6 +133,21 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); + + static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st); + ++static int is_dot_or_dotdot(const char *name) ++{ ++ return name[0] == '.' && ++ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')); ++} ++ ++/* Is `path` a single path component that is not "." or ".."? */ ++static int is_safe_path_component(const char *path) ++{ ++ if (strchr(path, '/')) { ++ return 0; ++ } ++ ++ return !is_dot_or_dotdot(path); ++} + + static struct lo_data *lo_data(fuse_req_t req) + { +@@ -681,6 +696,15 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + parent, name); + } + ++ /* ++ * Don't use is_safe_path_component(), allow "." and ".." for NFS export ++ * support. 
++ */ ++ if (strchr(name, '/')) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + err = lo_do_lookup(req, parent, name, &e); + if (err) { + fuse_reply_err(req, err); +@@ -762,6 +786,11 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + struct fuse_entry_param e; + struct lo_cred old = {}; + ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + dir = lo_inode(req, parent); + if (!dir) { + fuse_reply_err(req, EBADF); +@@ -863,6 +892,11 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + struct fuse_entry_param e; + int saverr; + ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); +@@ -904,6 +938,10 @@ out_err: + static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) + { + int res; ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); + +@@ -916,6 +954,11 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + { + int res; + ++ if (!is_safe_path_component(name) || !is_safe_path_component(newname)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + if (flags) { + fuse_reply_err(req, EINVAL); + return; +@@ -930,6 +973,11 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + { + int res; + ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + res = unlinkat(lo_fd(req, parent), name, 0); + + fuse_reply_err(req, res == -1 ? errno : 0); +@@ -1093,12 +1141,6 @@ out_err: + fuse_reply_err(req, error); + } + +-static int is_dot_or_dotdot(const char *name) +-{ +- return name[0] == '.' && +- (name[1] == '\0' || (name[1] == '.' 
&& name[2] == '\0')); +-} +- + static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) + { +@@ -1248,6 +1290,11 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + parent, name); + } + ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + err = lo_change_cred(req, &old); + if (err) { + goto out; +-- +1.8.3.1 + diff --git a/kvm-vitriofsd-passthrough_ll-fix-fallocate-ifdefs.patch b/kvm-vitriofsd-passthrough_ll-fix-fallocate-ifdefs.patch new file mode 100755 index 0000000..20add81 --- /dev/null +++ b/kvm-vitriofsd-passthrough_ll-fix-fallocate-ifdefs.patch @@ -0,0 +1,56 @@ +From 247987aa987b7332eb501e00c440079b9e8e1fe7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:52 +0100 +Subject: [PATCH 021/116] vitriofsd/passthrough_ll: fix fallocate() ifdefs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-18-dgilbert@redhat.com> +Patchwork-id: 93471 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 017/112] vitriofsd/passthrough_ll: fix fallocate() ifdefs +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Xiao Yang + +1) Use correct CONFIG_FALLOCATE macro to check if fallocate() is supported. (i.e. configure + script sets CONFIG_FALLOCATE instead of HAVE_FALLOCATE if fallocate() is supported) +2) Replace HAVE_POSIX_FALLOCATE with CONFIG_POSIX_FALLOCATE. + +Signed-off-by: Xiao Yang +Signed-off-by: Dr.
David Alan Gilbert + Merged from two of Xiao Yang's patches +(cherry picked from commit 9776457ca6f05d5900e27decb1dba2ffddf95a22) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 322a889..6c4da18 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -975,13 +975,13 @@ static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + int err = EOPNOTSUPP; + (void)ino; + +-#ifdef HAVE_FALLOCATE ++#ifdef CONFIG_FALLOCATE + err = fallocate(fi->fh, mode, offset, length); + if (err < 0) { + err = errno; + } + +-#elif defined(HAVE_POSIX_FALLOCATE) ++#elif defined(CONFIG_POSIX_FALLOCATE) + if (mode) { + fuse_reply_err(req, EOPNOTSUPP); + return; +-- +1.8.3.1 + diff --git a/kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch b/kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch new file mode 100755 index 0000000..dbcf2a7 --- /dev/null +++ b/kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch @@ -0,0 +1,63 @@ +From ad50e0e2d310277f06a9c512fe6e31da183ead6e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Wed, 24 Feb 2021 11:30:34 -0500 +Subject: [PATCH 1/4] x86/cpu: Enable AVX512_VP2INTERSECT cpu feature + +RH-Author: Dr. David Alan Gilbert +Message-id: <20210224113037.15599-2-dgilbert@redhat.com> +Patchwork-id: 101203 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/4] x86/cpu: Enable AVX512_VP2INTERSECT cpu feature +Bugzilla: 1790620 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Peter Xu + +From: Cathy Zhang + +AVX512_VP2INTERSECT compute vector pair intersection to a pair +of mask registers, which is introduced with intel Tiger Lake, +defining as CPUID.(EAX=7,ECX=0):EDX[bit 08]. 
+ +Refer to the following release spec: +https://software.intel.com/sites/default/files/managed/c5/15/\ +architecture-instruction-set-extensions-programming-reference.pdf + +Signed-off-by: Cathy Zhang +Message-Id: <1586760758-13638-1-git-send-email-cathy.zhang@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 353f98c9ad52ff4b8cfe553c90be04f747a14c98) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 2 +- + target/i386/cpu.h | 2 ++ + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index ff39fc9905..67dab94aa5 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1078,7 +1078,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + .feat_names = { + NULL, NULL, "avx512-4vnniw", "avx512-4fmaps", + NULL, NULL, NULL, NULL, +- NULL, NULL, "md-clear", NULL, ++ "avx512-vp2intersect", NULL, "md-clear", NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL /* pconfig */, NULL, + NULL, NULL, NULL, NULL, +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index f3da25cb8a..8e2e52ed31 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -770,6 +770,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_7_0_EDX_AVX512_4VNNIW (1U << 2) + /* AVX512 Multiply Accumulation Single Precision */ + #define CPUID_7_0_EDX_AVX512_4FMAPS (1U << 3) ++/* AVX512 Vector Pair Intersection to a Pair of Mask Registers */ ++#define CPUID_7_0_EDX_AVX512_VP2INTERSECT (1U << 8) + /* Speculation Control */ + #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) + /* Single Thread Indirect Branch Predictors */ +-- +2.27.0 + diff --git a/kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch b/kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch new file mode 100755 index 0000000..9ef6d04 --- /dev/null +++ b/kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch @@ -0,0 +1,91 @@ +From 655e723a5190206302f6cc4f2e794563b8e1c226 Mon Sep 17 00:00:00 2001 +From: "Dr. 
David Alan Gilbert" +Date: Wed, 24 Feb 2021 11:30:36 -0500 +Subject: [PATCH 3/4] x86/cpu: Populate SVM CPUID feature bits + +RH-Author: Dr. David Alan Gilbert +Message-id: <20210224113037.15599-4-dgilbert@redhat.com> +Patchwork-id: 101200 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/4] x86/cpu: Populate SVM CPUID feature bits +Bugzilla: 1790620 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Peter Xu + +From: Wei Huang + +Newer AMD CPUs will add CPUID_0x8000000A_EDX[28] bit, which indicates +that SVM instructions (VMRUN/VMSAVE/VMLOAD) will trigger #VMEXIT before +CPU checking their EAX against reserved memory regions. This change will +allow the hypervisor to avoid intercepting #GP and emulating SVM +instructions. KVM turns on this CPUID bit for nested VMs. In order to +support it, let us populate this bit, along with other SVM feature bits, +in FEAT_SVM. + +Signed-off-by: Wei Huang +Message-Id: <20210126202456.589932-1-wei.huang2@amd.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 5447089c2b3b084b51670af36fc86ee3979e04be) +Signed-off-by: Danilo C. L. 
de Paula +--- + target/i386/cpu.c | 6 +++--- + target/i386/cpu.h | 24 ++++++++++++++---------- + 2 files changed, 17 insertions(+), 13 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index f6a9ed84b3..7227c803c3 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1026,11 +1026,11 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + "npt", "lbrv", "svm-lock", "nrip-save", + "tsc-scale", "vmcb-clean", "flushbyasid", "decodeassists", + NULL, NULL, "pause-filter", NULL, +- "pfthreshold", NULL, NULL, NULL, +- NULL, NULL, NULL, NULL, +- NULL, NULL, NULL, NULL, ++ "pfthreshold", "avic", NULL, "v-vmsave-vmload", ++ "vgif", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, ++ "svme-addr-chk", NULL, NULL, NULL, + }, + .cpuid = { .eax = 0x8000000A, .reg = R_EDX, }, + .tcg_features = TCG_SVM_FEATURES, +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index f5a4efcec6..e1b67910c2 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -667,16 +667,20 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_EXT3_PERFCORE (1U << 23) + #define CPUID_EXT3_PERFNB (1U << 24) + +-#define CPUID_SVM_NPT (1U << 0) +-#define CPUID_SVM_LBRV (1U << 1) +-#define CPUID_SVM_SVMLOCK (1U << 2) +-#define CPUID_SVM_NRIPSAVE (1U << 3) +-#define CPUID_SVM_TSCSCALE (1U << 4) +-#define CPUID_SVM_VMCBCLEAN (1U << 5) +-#define CPUID_SVM_FLUSHASID (1U << 6) +-#define CPUID_SVM_DECODEASSIST (1U << 7) +-#define CPUID_SVM_PAUSEFILTER (1U << 10) +-#define CPUID_SVM_PFTHRESHOLD (1U << 12) ++#define CPUID_SVM_NPT (1U << 0) ++#define CPUID_SVM_LBRV (1U << 1) ++#define CPUID_SVM_SVMLOCK (1U << 2) ++#define CPUID_SVM_NRIPSAVE (1U << 3) ++#define CPUID_SVM_TSCSCALE (1U << 4) ++#define CPUID_SVM_VMCBCLEAN (1U << 5) ++#define CPUID_SVM_FLUSHASID (1U << 6) ++#define CPUID_SVM_DECODEASSIST (1U << 7) ++#define CPUID_SVM_PAUSEFILTER (1U << 10) ++#define CPUID_SVM_PFTHRESHOLD (1U << 12) ++#define CPUID_SVM_AVIC (1U << 13) ++#define 
CPUID_SVM_V_VMSAVE_VMLOAD (1U << 15) ++#define CPUID_SVM_VGIF (1U << 16) ++#define CPUID_SVM_SVME_ADDR_CHK (1U << 28) + + /* Support RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */ + #define CPUID_7_0_EBX_FSGSBASE (1U << 0) +-- +2.27.0 + diff --git a/kvm-x86.conf b/kvm-x86.conf new file mode 100755 index 0000000..3f7842a --- /dev/null +++ b/kvm-x86.conf @@ -0,0 +1,12 @@ +# Setting modprobe kvm_intel/kvm_amd nested = 1 +# only enables Nested Virtualization until the next reboot or +# module reload. Uncomment the option applicable +# to your system below to enable the feature permanently. +# +# User changes in this file are preserved across upgrades. +# +# For Intel +#options kvm_intel nested=1 +# +# For AMD +#options kvm_amd nested=1 diff --git a/kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch b/kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch new file mode 100755 index 0000000..aabe041 --- /dev/null +++ b/kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch @@ -0,0 +1,76 @@ +From f38f51d422e82d1241b678960dd6a033ffa398da Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:05 -0400 +Subject: [PATCH 6/7] xhci: fix valid.max_access_size to access address + registers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-6-jmaloy@redhat.com> +Patchwork-id: 101483 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 5/6] xhci: fix valid.max_access_size to access address registers +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Laurent Vivier + +QEMU XHCI advertises AC64 (64-bit addressing) but doesn't allow +64-bit mode access in "runtime" and "operational" MemoryRegionOps. + +Set the max_access_size based on sizeof(dma_addr_t) as AC64 is set. 
+ +XHCI specs: +"If the xHC supports 64-bit addressing (AC64 = ‘1’), then software +should write 64-bit registers using only Qword accesses. If a +system is incapable of issuing Qword accesses, then writes to the +64-bit address fields shall be performed using 2 Dword accesses; +low Dword-first, high-Dword second. If the xHC supports 32-bit +addressing (AC64 = ‘0’), then the high Dword of registers containing +64-bit address fields are unused and software should write addresses +using only Dword accesses" + +The problem has been detected with SLOF, as linux kernel always accesses +registers using 32-bit access even if AC64 is set and revealed by +5d971f9e6725 ("memory: Revert "memory: accept mismatching sizes in memory_region_access_valid"") + +Suggested-by: Alexey Kardashevskiy +Signed-off-by: Laurent Vivier +Message-id: 20200721083322.90651-1-lvivier@redhat.com +Signed-off-by: Gerd Hoffmann + +(cherry picked from commit 8e67fda2dd6202ccec093fda561107ba14830a17) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. 
de Paula +--- + hw/usb/hcd-xhci.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index 646c78cde9..ab449bb003 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -3183,7 +3183,7 @@ static const MemoryRegionOps xhci_oper_ops = { + .read = xhci_oper_read, + .write = xhci_oper_write, + .valid.min_access_size = 4, +- .valid.max_access_size = 4, ++ .valid.max_access_size = sizeof(dma_addr_t), + .endianness = DEVICE_LITTLE_ENDIAN, + }; + +@@ -3199,7 +3199,7 @@ static const MemoryRegionOps xhci_runtime_ops = { + .read = xhci_runtime_read, + .write = xhci_runtime_write, + .valid.min_access_size = 4, +- .valid.max_access_size = 4, ++ .valid.max_access_size = sizeof(dma_addr_t), + .endianness = DEVICE_LITTLE_ENDIAN, + }; + +-- +2.27.0 + diff --git a/kvm-xhci-recheck-slot-status.patch b/kvm-xhci-recheck-slot-status.patch new file mode 100755 index 0000000..8bcbc2c --- /dev/null +++ b/kvm-xhci-recheck-slot-status.patch @@ -0,0 +1,77 @@ +From ab87c0ed2a8f0a626099261a3028bc34cfac3929 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 14 Jan 2020 20:23:31 +0000 +Subject: [PATCH 5/5] xhci: recheck slot status +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200114202331.51831-3-dgilbert@redhat.com> +Patchwork-id: 93345 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] xhci: recheck slot status +Bugzilla: 1790844 +RH-Acked-by: Peter Xu +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Gerd Hoffmann + +From: Gerd Hoffmann + +Factor out slot status check into a helper function. Add an additional +check after completing transfers. This is needed in case a guest +queues multiple transfers in a row and a device unplug happens while +qemu processes them. 
+ +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1786413 +Signed-off-by: Gerd Hoffmann +Reviewed-by: Philippe Mathieu-Daudé +Message-id: 20200107083606.12393-1-kraxel@redhat.com +(cherry picked from commit 236846a019c4f7aa3111026fc9a1fe09684c8978) +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/hcd-xhci.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index d2b9744..646c78c 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -1861,6 +1861,13 @@ static void xhci_kick_ep(XHCIState *xhci, unsigned int slotid, + xhci_kick_epctx(epctx, streamid); + } + ++static bool xhci_slot_ok(XHCIState *xhci, int slotid) ++{ ++ return (xhci->slots[slotid - 1].uport && ++ xhci->slots[slotid - 1].uport->dev && ++ xhci->slots[slotid - 1].uport->dev->attached); ++} ++ + static void xhci_kick_epctx(XHCIEPContext *epctx, unsigned int streamid) + { + XHCIState *xhci = epctx->xhci; +@@ -1878,9 +1885,7 @@ static void xhci_kick_epctx(XHCIEPContext *epctx, unsigned int streamid) + + /* If the device has been detached, but the guest has not noticed this + yet the 2 above checks will succeed, but we must NOT continue */ +- if (!xhci->slots[epctx->slotid - 1].uport || +- !xhci->slots[epctx->slotid - 1].uport->dev || +- !xhci->slots[epctx->slotid - 1].uport->dev->attached) { ++ if (!xhci_slot_ok(xhci, epctx->slotid)) { + return; + } + +@@ -1987,6 +1992,10 @@ static void xhci_kick_epctx(XHCIEPContext *epctx, unsigned int streamid) + } else { + xhci_fire_transfer(xhci, xfer, epctx); + } ++ if (!xhci_slot_ok(xhci, epctx->slotid)) { ++ /* surprise removal -> stop processing */ ++ break; ++ } + if (xfer->complete) { + /* update ring dequeue ptr */ + xhci_set_ep_state(xhci, epctx, stctx, epctx->state); +-- +1.8.3.1 + diff --git a/kvm-xics-Don-t-deassert-outputs.patch b/kvm-xics-Don-t-deassert-outputs.patch new file mode 100755 index 0000000..08ed724 --- /dev/null +++ b/kvm-xics-Don-t-deassert-outputs.patch 
@@ -0,0 +1,52 @@ +From 99b6ee4b7f63ea49e5b73f61bbf68f67252f27da Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Tue, 21 Jan 2020 05:16:12 +0000 +Subject: [PATCH 02/15] xics: Don't deassert outputs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: David Gibson +Message-id: <20200121051613.388295-3-dgibson@redhat.com> +Patchwork-id: 93430 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 2/3] xics: Don't deassert outputs +Bugzilla: 1776638 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Greg Kurz + +The correct way to do this is to deassert the input pins on the CPU side. +This is the case since a previous change. + +Signed-off-by: Greg Kurz +Message-Id: <157548862298.3650476.1228720391270249433.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 4febcdd88f08422a66a1aa0dc55e1472abed3c4b) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1776638 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. de Paula +--- + hw/intc/xics.c | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/hw/intc/xics.c b/hw/intc/xics.c +index e7ac9ba..72c5dca 100644 +--- a/hw/intc/xics.c ++++ b/hw/intc/xics.c +@@ -289,9 +289,6 @@ void icp_reset(ICPState *icp) + icp->pending_priority = 0xff; + icp->mfrr = 0xff; + +- /* Make all outputs are deasserted */ +- qemu_set_irq(icp->output, 0); +- + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + +-- +1.8.3.1 + diff --git a/kvm.conf b/kvm.conf new file mode 100755 index 0000000..24e60e9 --- /dev/null +++ b/kvm.conf @@ -0,0 +1,3 @@ +# +# User changes in this file are preserved across upgrades. +# diff --git a/qemu-ga.sysconfig b/qemu-ga.sysconfig new file mode 100755 index 0000000..67bad0c --- /dev/null +++ b/qemu-ga.sysconfig @@ -0,0 +1,19 @@ +# This is a systemd environment file, not a shell script. +# It provides settings for "/lib/systemd/system/qemu-guest-agent.service". 
+ +# Comma-separated blacklist of RPCs to disable, or empty list to enable all. +# +# You can get the list of RPC commands using "qemu-ga --blacklist='?'". +# There should be no spaces between commas and commands in the blacklist. +BLACKLIST_RPC=guest-file-open,guest-file-close,guest-file-read,guest-file-write,guest-file-seek,guest-file-flush,guest-exec,guest-exec-status + +# Fsfreeze hook script specification. +# +# FSFREEZE_HOOK_PATHNAME=/dev/null : disables the feature. +# +# FSFREEZE_HOOK_PATHNAME=/path/to/executable : enables the feature with the +# specified binary or shell script. +# +# FSFREEZE_HOOK_PATHNAME= : enables the feature with the +# default value (invoke "qemu-ga --help" to interrogate). +FSFREEZE_HOOK_PATHNAME=/etc/qemu-ga/fsfreeze-hook diff --git a/qemu-guest-agent.service b/qemu-guest-agent.service new file mode 100755 index 0000000..b33e951 --- /dev/null +++ b/qemu-guest-agent.service @@ -0,0 +1,20 @@ +[Unit] +Description=QEMU Guest Agent +BindsTo=dev-virtio\x2dports-org.qemu.guest_agent.0.device +After=dev-virtio\x2dports-org.qemu.guest_agent.0.device +IgnoreOnIsolate=True + +[Service] +UMask=0077 +EnvironmentFile=/etc/sysconfig/qemu-ga +ExecStart=/usr/bin/qemu-ga \ + --method=virtio-serial \ + --path=/dev/virtio-ports/org.qemu.guest_agent.0 \ + --blacklist=${BLACKLIST_RPC} \ + -F${FSFREEZE_HOOK_PATHNAME} +StandardError=syslog +Restart=always +RestartSec=0 + +[Install] +WantedBy=dev-virtio\x2dports-org.qemu.guest_agent.0.device diff --git a/qemu-kvm.spec b/qemu-kvm.spec new file mode 100755 index 0000000..1e341d3 --- /dev/null +++ b/qemu-kvm.spec @@ -0,0 +1,4271 @@ +%global SLOF_gittagdate 20191022 +%global SLOF_gittagcommit 899d9883 + +%global have_usbredir 1 +%global have_spice 1 +%global have_opengl 1 +%global have_fdt 0 +%global have_gluster 1 +%global have_kvm_setup 0 +%global have_memlock_limits 0 + +%ifnarch %{ix86} x86_64 + %global have_usbredir 0 +%endif + +%ifnarch s390x + %global have_librdma 1 +%else + %global have_librdma 0 
+%endif + +%ifarch %{ix86} + %global kvm_target i386 +%endif +%ifarch x86_64 + %global kvm_target x86_64 +%else + %global have_spice 0 + %global have_opengl 0 + %global have_gluster 0 +%endif +%ifarch %{power64} + %global kvm_target ppc64 + %global have_fdt 1 + %global have_kvm_setup 1 + %global have_memlock_limits 1 +%endif +%ifarch s390x + %global kvm_target s390x + %global have_kvm_setup 1 +%endif +%ifarch ppc + %global kvm_target ppc + %global have_fdt 1 +%endif +%ifarch aarch64 + %global kvm_target aarch64 + %global have_fdt 1 +%endif + +#Versions of various parts: + +%global requires_all_modules \ +Requires: %{name}-block-curl = %{epoch}:%{version}-%{release} \ +%if %{have_gluster} \ +Requires: %{name}-block-gluster = %{epoch}:%{version}-%{release} \ +%endif \ +Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \ +Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \ +Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release} + +# Macro to properly setup RHEL/RHEV conflict handling +%define rhev_ma_conflicts() \ +Obsoletes: %1-ma <= %{epoch}:%{version}-%{release} \ +Obsoletes: %1-rhev <= %{epoch}:%{version}-%{release} + +Summary: QEMU is a machine emulator and virtualizer +Name: qemu-kvm +Version: 4.2.0 +Release: 59%{?dist}.2 +# Epoch because we pushed a qemu-1.0 package. 
AIUI this can't ever be dropped +Epoch: 15 +License: GPLv2 and GPLv2+ and CC-BY +Group: Development/Tools +URL: http://www.qemu.org/ +ExclusiveArch: x86_64 %{power64} aarch64 s390x + + +Source0: http://wiki.qemu.org/download/qemu-4.2.0.tar.xz + +# KSM control scripts +Source4: ksm.service +Source5: ksm.sysconfig +Source6: ksmctl.c +Source7: ksmtuned.service +Source8: ksmtuned +Source9: ksmtuned.conf +Source10: qemu-guest-agent.service +Source11: 99-qemu-guest-agent.rules +Source12: bridge.conf +Source13: qemu-ga.sysconfig +Source21: kvm-setup +Source22: kvm-setup.service +Source23: 85-kvm.preset +Source26: vhost.conf +Source27: kvm.conf +Source28: 95-kvm-memlock.conf +Source30: kvm-s390x.conf +Source31: kvm-x86.conf +Source32: qemu-pr-helper.service +Source33: qemu-pr-helper.socket +Source34: 81-kvm-rhel.rules +Source35: udev-kvm-check.c +Source36: README.tests + + +Patch0005: 0005-Initial-redhat-build.patch +Patch0006: 0006-Enable-disable-devices-for-RHEL.patch +Patch0007: 0007-Machine-type-related-general-changes.patch +Patch0008: 0008-Add-aarch64-machine-types.patch +Patch0009: 0009-Add-ppc64-machine-types.patch +Patch0010: 0010-Add-s390x-machine-types.patch +Patch0011: 0011-Add-x86_64-machine-types.patch +Patch0012: 0012-Enable-make-check.patch +Patch0013: 0013-vfio-cap-number-of-devices-that-can-be-assigned.patch +Patch0014: 0014-Add-support-statement-to-help-output.patch +Patch0015: 0015-globally-limit-the-maximum-number-of-CPUs.patch +Patch0016: 0016-Add-support-for-simpletrace.patch +Patch0017: 0017-Use-qemu-kvm-in-documentation-instead-of-qemu-system.patch +Patch0018: 0018-usb-xhci-Fix-PCI-capability-order.patch +Patch0019: 0019-virtio-scsi-Reject-scsi-cd-if-data-plane-enabled-RHE.patch +Patch0020: 0020-BZ1653590-Require-at-least-64kiB-pages-for-downstrea.patch +Patch0021: 0021-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch +# For bz#1741345 - Remove the "cpu64-rhel6" CPU from qemu-kvm +Patch22: kvm-i386-Remove-cpu64-rhel6-CPU-model.patch +# 
For bz#1772774 - qemu-kvm core dump during migration+reboot ( Assertion `mem->dirty_bmap' failed ) +Patch23: kvm-Reallocate-dirty_bmap-when-we-change-a-slot.patch +# For bz#1733893 - Boot a guest with "-prom-env 'auto-boot?=false'", SLOF failed to enter the boot entry after input "boot" followed by "0 > " on VNC +Patch24: kvm-spapr-Don-t-trigger-a-CAS-reboot-for-XICS-XIVE-mode-.patch +# For bz#1782678 - qemu core dump after hot-unplugging the XXV710/XL710 PF +Patch25: kvm-vfio-pci-Don-t-remove-irqchip-notifier-if-not-regist.patch +# For bz#1789301 - virtio-blk/scsi: fix notification suppression during AioContext polling +Patch26: kvm-virtio-don-t-enable-notifications-during-polling.patch +# For bz#1790844 - USB related fixes +Patch27: kvm-usbredir-Prevent-recursion-in-usbredir_write.patch +# For bz#1790844 - USB related fixes +Patch28: kvm-xhci-recheck-slot-status.patch +# For bz#1791568 - CVE-2020-7039 qemu-kvm: QEMU: slirp: OOB buffer access while emulating tcp protocols in tcp_emu() [rhel-av-8.2.0] +Patch29: kvm-tcp_emu-Fix-oob-access.patch +# For bz#1791568 - CVE-2020-7039 qemu-kvm: QEMU: slirp: OOB buffer access while emulating tcp protocols in tcp_emu() [rhel-av-8.2.0] +Patch30: kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch +# For bz#1791568 - CVE-2020-7039 qemu-kvm: QEMU: slirp: OOB buffer access while emulating tcp protocols in tcp_emu() [rhel-av-8.2.0] +Patch31: kvm-slirp-use-correct-size-while-emulating-commands.patch +# For bz#1559846 - Nested KVM: limit VMX features according to CPU models - Fast Train +Patch32: kvm-RHEL-hw-i386-disable-nested-PERF_GLOBAL_CTRL-MSR-sup.patch +# For bz#1725084 - aarch64: support dumping SVE registers +Patch33: kvm-target-arm-arch_dump-Add-SVE-notes.patch +# For bz#1779041 - netkvm: no connectivity Windows guest with q35 + hugepages + vhost + hv_synic +Patch34: kvm-vhost-Add-names-to-section-rounded-warning.patch +# For bz#1779041 - netkvm: no connectivity Windows guest with q35 + hugepages + vhost + 
hv_synic +Patch35: kvm-vhost-Only-align-sections-for-vhost-user.patch +# For bz#1779041 - netkvm: no connectivity Windows guest with q35 + hugepages + vhost + hv_synic +Patch36: kvm-vhost-coding-style-fix.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch37: kvm-virtio-fs-fix-MSI-X-nvectors-calculation.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch38: kvm-vhost-user-fs-remove-vhostfd-property.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch39: kvm-build-rename-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch40: kvm-virtiofsd-Pull-in-upstream-headers.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch41: kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch42: kvm-virtiofsd-Add-auxiliary-.c-s.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch43: kvm-virtiofsd-Add-fuse_lowlevel.c.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch44: kvm-virtiofsd-Add-passthrough_ll.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch45: kvm-virtiofsd-Trim-down-imported-files.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch46: kvm-virtiofsd-Format-imported-files-to-qemu-style.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch47: kvm-virtiofsd-remove-mountpoint-dummy-argument.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch48: kvm-virtiofsd-remove-unused-notify-reply-support.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch49: kvm-virtiofsd-Remove-unused-enum-fuse_buf_copy_flags.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch50: 
kvm-virtiofsd-Fix-fuse_daemonize-ignored-return-values.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch51: kvm-virtiofsd-Fix-common-header-and-define-for-QEMU-buil.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch52: kvm-virtiofsd-Trim-out-compatibility-code.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch53: kvm-vitriofsd-passthrough_ll-fix-fallocate-ifdefs.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch54: kvm-virtiofsd-Make-fsync-work-even-if-only-inode-is-pass.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch55: kvm-virtiofsd-Add-options-for-virtio.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch56: kvm-virtiofsd-add-o-source-PATH-to-help-output.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch57: kvm-virtiofsd-Open-vhost-connection-instead-of-mounting.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch58: kvm-virtiofsd-Start-wiring-up-vhost-user.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch59: kvm-virtiofsd-Add-main-virtio-loop.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch60: kvm-virtiofsd-get-set-features-callbacks.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch61: kvm-virtiofsd-Start-queue-threads.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch62: kvm-virtiofsd-Poll-kick_fd-for-queue.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch63: kvm-virtiofsd-Start-reading-commands-from-queue.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch64: kvm-virtiofsd-Send-replies-to-messages.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch65: 
kvm-virtiofsd-Keep-track-of-replies.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch66: kvm-virtiofsd-Add-Makefile-wiring-for-virtiofsd-contrib.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch67: kvm-virtiofsd-Fast-path-for-virtio-read.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch68: kvm-virtiofsd-add-fd-FDNUM-fd-passing-option.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch69: kvm-virtiofsd-make-f-foreground-the-default.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch70: kvm-virtiofsd-add-vhost-user.json-file.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch71: kvm-virtiofsd-add-print-capabilities-option.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch72: kvm-virtiofs-Add-maintainers-entry.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch73: kvm-virtiofsd-passthrough_ll-create-new-files-in-caller-.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch74: kvm-virtiofsd-passthrough_ll-add-lo_map-for-ino-fh-indir.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch75: kvm-virtiofsd-passthrough_ll-add-ino_map-to-hide-lo_inod.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch76: kvm-virtiofsd-passthrough_ll-add-dirp_map-to-hide-lo_dir.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch77: kvm-virtiofsd-passthrough_ll-add-fd_map-to-hide-file-des.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch78: kvm-virtiofsd-passthrough_ll-add-fallback-for-racy-ops.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch79: kvm-virtiofsd-validate-path-components.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) 
+Patch80: kvm-virtiofsd-Plumb-fuse_bufvec-through-to-do_write_buf.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch81: kvm-virtiofsd-Pass-write-iov-s-all-the-way-through.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch82: kvm-virtiofsd-add-fuse_mbuf_iter-API.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch83: kvm-virtiofsd-validate-input-buffer-sizes-in-do_write_bu.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch84: kvm-virtiofsd-check-input-buffer-size-in-fuse_lowlevel.c.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch85: kvm-virtiofsd-prevent-.-escape-in-lo_do_lookup.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch86: kvm-virtiofsd-prevent-.-escape-in-lo_do_readdir.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch87: kvm-virtiofsd-use-proc-self-fd-O_PATH-file-descriptor.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch88: kvm-virtiofsd-sandbox-mount-namespace.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch89: kvm-virtiofsd-move-to-an-empty-network-namespace.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch90: kvm-virtiofsd-move-to-a-new-pid-namespace.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch91: kvm-virtiofsd-add-seccomp-whitelist.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch92: kvm-virtiofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch93: kvm-virtiofsd-cap-ng-helpers.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch94: kvm-virtiofsd-Drop-CAP_FSETID-if-client-asked-for-it.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch95: 
kvm-virtiofsd-set-maximum-RLIMIT_NOFILE-limit.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch96: kvm-virtiofsd-fix-libfuse-information-leaks.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch97: kvm-virtiofsd-add-syslog-command-line-option.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch98: kvm-virtiofsd-print-log-only-when-priority-is-high-enoug.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch99: kvm-virtiofsd-Add-ID-to-the-log-with-FUSE_LOG_DEBUG-leve.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch100: kvm-virtiofsd-Add-timestamp-to-the-log-with-FUSE_LOG_DEB.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch101: kvm-virtiofsd-Handle-reinit.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch102: kvm-virtiofsd-Handle-hard-reboot.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch103: kvm-virtiofsd-Kill-threads-when-queues-are-stopped.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch104: kvm-vhost-user-Print-unexpected-slave-message-types.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch105: kvm-contrib-libvhost-user-Protect-slave-fd-with-mutex.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch106: kvm-virtiofsd-passthrough_ll-add-renameat2-support.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch107: kvm-virtiofsd-passthrough_ll-disable-readdirplus-on-cach.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch108: kvm-virtiofsd-passthrough_ll-control-readdirplus.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch109: kvm-virtiofsd-rename-unref_inode-to-unref_inode_lolocked.patch +# For bz#1694164 - virtio-fs: 
host<->guest shared file system (qemu) +Patch110: kvm-virtiofsd-fail-when-parent-inode-isn-t-known-in-lo_d.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch111: kvm-virtiofsd-extract-root-inode-init-into-setup_root.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch112: kvm-virtiofsd-passthrough_ll-clean-up-cache-related-opti.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch113: kvm-virtiofsd-passthrough_ll-use-hashtable.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch114: kvm-virtiofsd-Clean-up-inodes-on-destroy.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch115: kvm-virtiofsd-support-nanosecond-resolution-for-file-tim.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch116: kvm-virtiofsd-fix-error-handling-in-main.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch117: kvm-virtiofsd-cleanup-allocated-resource-in-se.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch118: kvm-virtiofsd-fix-memory-leak-on-lo.source.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch119: kvm-virtiofsd-add-helper-for-lo_data-cleanup.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch120: kvm-virtiofsd-Prevent-multiply-running-with-same-vhost_u.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch121: kvm-virtiofsd-enable-PARALLEL_DIROPS-during-INIT.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch122: kvm-virtiofsd-fix-incorrect-error-handling-in-lo_do_look.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch123: kvm-Virtiofsd-fix-memory-leak-on-fuse-queueinfo.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch124: 
kvm-virtiofsd-Support-remote-posix-locks.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch125: kvm-virtiofsd-use-fuse_lowlevel_is_virtio-in-fuse_sessio.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch126: kvm-virtiofsd-prevent-fv_queue_thread-vs-virtio_loop-rac.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch127: kvm-virtiofsd-make-lo_release-atomic.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch128: kvm-virtiofsd-prevent-races-with-lo_dirp_put.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch129: kvm-virtiofsd-rename-inode-refcount-to-inode-nlookup.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch130: kvm-libvhost-user-Fix-some-memtable-remap-cases.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch131: kvm-virtiofsd-passthrough_ll-fix-refcounting-on-remove-r.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch132: kvm-virtiofsd-introduce-inode-refcount-to-prevent-use-af.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch133: kvm-virtiofsd-do-not-always-set-FUSE_FLOCK_LOCKS.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch134: kvm-virtiofsd-convert-more-fprintf-and-perror-to-use-fus.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch135: kvm-virtiofsd-Reset-O_DIRECT-flag-during-file-open.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch136: kvm-virtiofsd-Fix-data-corruption-with-O_APPEND-write-in.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch137: kvm-virtiofsd-passthrough_ll-Use-cache_readdir-for-direc.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch138: kvm-virtiofsd-add-definition-of-fuse_buf_writev.patch +# 
For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch139: kvm-virtiofsd-use-fuse_buf_writev-to-replace-fuse_buf_wr.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch140: kvm-virtiofsd-process-requests-in-a-thread-pool.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch141: kvm-virtiofsd-prevent-FUSE_INIT-FUSE_DESTROY-races.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch142: kvm-virtiofsd-fix-lo_destroy-resource-leaks.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch143: kvm-virtiofsd-add-thread-pool-size-NUM-option.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch144: kvm-virtiofsd-Convert-lo_destroy-to-take-the-lo-mutex-lo.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch145: kvm-virtiofsd-passthrough_ll-Pass-errno-to-fuse_reply_er.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch146: kvm-virtiofsd-stop-all-queue-threads-on-exit-in-virtio_l.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch147: kvm-virtiofsd-add-some-options-to-the-help-message.patch +# For bz#1776638 - Guest failed to boot up after system_reset 20 times +Patch148: kvm-ppc-Deassert-the-external-interrupt-pin-in-KVM-on-re.patch +# For bz#1776638 - Guest failed to boot up after system_reset 20 times +Patch149: kvm-xics-Don-t-deassert-outputs.patch +# For bz#1776638 - Guest failed to boot up after system_reset 20 times +Patch150: kvm-ppc-Don-t-use-CPUPPCState-irq_input_state-with-moder.patch +# For bz#1787395 - qemu-trace-stap list : TypeError: startswith first arg must be bytes or a tuple of bytes, not str +Patch151: kvm-trace-update-qemu-trace-stap-to-Python-3.patch +# For bz#1794503 - CVE-2020-1711 qemu-kvm: QEMU: block: iscsi: OOB heap access via an unexpected response of iSCSI Server [rhel-av-8.2.0] +Patch153: 
kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch +# For bz#1787444 - Broken postcopy migration with vTPM device +Patch154: kvm-tpm-ppi-page-align-PPI-RAM.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch155: kvm-target-arm-kvm-trivial-Clean-up-header-documentation.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch156: kvm-target-arm-kvm64-kvm64-cpus-have-timer-registers.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch157: kvm-tests-arm-cpu-features-Check-feature-default-values.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch158: kvm-target-arm-kvm-Implement-virtual-time-adjustment.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch159: kvm-target-arm-cpu-Add-the-kvm-no-adjvtime-CPU-property.patch +# For bz#1529231 - [q35] VM hangs after migration with 200 vCPUs +Patch160: kvm-migration-Define-VMSTATE_INSTANCE_ID_ANY.patch +# For bz#1529231 - [q35] VM hangs after migration with 200 vCPUs +Patch161: kvm-migration-Change-SaveStateEntry.instance_id-into-uin.patch +# For bz#1529231 - [q35] VM hangs after migration with 200 vCPUs +Patch162: kvm-apic-Use-32bit-APIC-ID-for-migration-instance-ID.patch +# For bz#1779078 - RHVH 4.4: Failed to run VM on 4.3/4.4 engine (Exit message: the CPU is incompatible with host CPU: Host CPU does not provide required features: hle, rtm) +# For bz#1787291 - RHVH 4.4: Failed to run VM on 4.3/4.4 engine (Exit message: the CPU is incompatible with host CPU: Host CPU does not provide required features: hle, rtm) [rhel-8.1.0.z] +# For bz#1779078 - RHVH 4.4: Failed to run VM on 4.3/4.4 engine (Exit message: the CPU is incompatible with host CPU: Host CPU does not provide required features: hle, rtm) +# For bz#1779078 - RHVH 4.4: Failed to run VM on 4.3/4.4 engine (Exit message: the CPU is incompatible with host CPU: 
Host CPU does not provide required features: hle, rtm) +Patch163: kvm-i386-Resolve-CPU-models-to-v1-by-default.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch164: kvm-iotests-Support-job-complete-in-run_job.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch165: kvm-iotests-Create-VM.blockdev_create.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch166: kvm-block-Activate-recursively-even-for-already-active-n.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch167: kvm-hmp-Allow-using-qdev-ID-for-qemu-io-command.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch168: kvm-iotests-Test-external-snapshot-with-VM-state.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch169: kvm-iotests.py-Let-wait_migration-wait-even-more.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch170: kvm-blockdev-fix-coding-style-issues-in-drive_backup_pre.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot 
path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch171: kvm-blockdev-unify-qmp_drive_backup-and-drive-backup-tra.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch172: kvm-blockdev-unify-qmp_blockdev_backup-and-blockdev-back.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch173: kvm-blockdev-honor-bdrv_try_set_aio_context-context-requ.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without 
bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch174: kvm-backup-top-Begin-drain-earlier.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch175: kvm-block-backup-top-Don-t-acquire-context-while-droppin.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For 
bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch176: kvm-blockdev-Acquire-AioContext-on-dirty-bitmap-function.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch177: kvm-blockdev-Return-bs-to-the-proper-context-on-snapshot.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch178: kvm-iotests-Test-handling-of-AioContexts-with-some-block.patch +# For bz#1801320 - aarch64: backport query-cpu-model-expansion and adjvtime document fixes +Patch179: kvm-target-arm-monitor-query-cpu-model-expansion-crashed.patch +# For bz#1801320 - aarch64: backport query-cpu-model-expansion and adjvtime document fixes +Patch180: 
kvm-docs-arm-cpu-features-Make-kvm-no-adjvtime-comment-c.patch +# For bz#1796240 - Enable hw accelerated cache-count-flush by default for POWER9 DD2.3 cpus +Patch181: kvm-spapr-Enable-DD2.3-accelerated-count-cache-flush-in-.patch +# For bz#1798994 - CVE-2020-8608 qemu-kvm: QEMU: Slirp: potential OOB access due to unsafe snprintf() usages [rhel-av-8.2.0] +Patch182: kvm-util-add-slirp_fmt-helpers.patch +# For bz#1798994 - CVE-2020-8608 qemu-kvm: QEMU: Slirp: potential OOB access due to unsafe snprintf() usages [rhel-av-8.2.0] +Patch183: kvm-tcp_emu-fix-unsafe-snprintf-usages.patch +# For bz#1791590 - [Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device +Patch184: kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch +# For bz#1791590 - [Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device +Patch185: kvm-virtio-make-virtio_delete_queue-idempotent.patch +# For bz#1791590 - [Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device +Patch186: kvm-virtio-reset-region-cache-when-on-queue-deletion.patch +# For bz#1791590 - [Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device +Patch187: kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch +# For bz#1805334 - vhost-user/50-qemu-gpu.json is not valid JSON +Patch188: kvm-vhost-user-gpu-Drop-trailing-json-comma.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch189: kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch190: kvm-target-i386-add-a-ucode-rev-property.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch191: kvm-target-i386-kvm-initialize-microcode-revision-from-K.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch192: 
kvm-target-i386-fix-TCG-UCODE_REV-access.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch193: kvm-target-i386-check-for-availability-of-MSR_IA32_UCODE.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch194: kvm-target-i386-enable-monitor-and-ucode-revision-with-c.patch +# For bz#1703907 - [upstream]QEMU coredump when converting to qcow2: external data file images on block devices with copy_offloading +Patch195: kvm-qcow2-Fix-qcow2_alloc_cluster_abort-for-external-dat.patch +# For bz#1794692 - Mirror block job stops making progress +Patch196: kvm-mirror-Store-MirrorOp.co-for-debuggability.patch +# For bz#1794692 - Mirror block job stops making progress +Patch197: kvm-mirror-Don-t-let-an-operation-wait-for-itself.patch +# For bz#1782529 - Windows Update Enablement with default smbios strings in qemu +Patch198: kvm-hw-smbios-set-new-default-SMBIOS-fields-for-Windows-.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch199: kvm-migration-multifd-clean-pages-after-filling-packet.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch200: kvm-migration-Make-sure-that-we-don-t-call-write-in-case.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch201: kvm-migration-multifd-fix-nullptr-access-in-terminating-.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch202: kvm-migration-multifd-fix-destroyed-mutex-access-in-term.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) 
+Patch203: kvm-multifd-Make-sure-that-we-don-t-do-any-IO-after-an-e.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch204: kvm-qemu-file-Don-t-do-IO-after-shutdown.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch205: kvm-migration-Don-t-send-data-if-we-have-stopped.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch206: kvm-migration-Create-migration_is_running.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch207: kvm-migration-multifd-fix-nullptr-access-in-multifd_send.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch208: kvm-migration-Maybe-VM-is-paused-when-migration-is-cance.patch +# For bz#1797064 - virtiofsd: Fixes +Patch209: kvm-virtiofsd-Remove-fuse_req_getgroups.patch +# For bz#1797064 - virtiofsd: Fixes +Patch210: kvm-virtiofsd-fv_create_listen_socket-error-path-socket-.patch +# For bz#1797064 - virtiofsd: Fixes +Patch211: kvm-virtiofsd-load_capng-missing-unlock.patch +# For bz#1797064 - virtiofsd: Fixes +Patch212: kvm-virtiofsd-do_read-missing-NULL-check.patch +# For bz#1797064 - virtiofsd: Fixes +Patch213: kvm-tools-virtiofsd-fuse_lowlevel-Fix-fuse_out_header-er.patch +# For bz#1797064 - virtiofsd: Fixes +Patch214: kvm-virtiofsd-passthrough_ll-cleanup-getxattr-listxattr.patch +# For bz#1797064 - virtiofsd: Fixes +Patch215: kvm-virtiofsd-Fix-xattr-operations.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch216: kvm-block-nbd-Fix-hang-in-.bdrv_close.patch +# For bz#1640894 - Fix generic file creation 
fallback for qemu-img nvme:// image creation support +Patch217: kvm-block-Generic-file-creation-fallback.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch218: kvm-file-posix-Drop-hdev_co_create_opts.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch219: kvm-iscsi-Drop-iscsi_co_create_opts.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch220: kvm-iotests-Add-test-for-image-creation-fallback.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch221: kvm-block-Fix-leak-in-bdrv_create_file_fallback.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch222: kvm-iotests-Use-complete_and_wait-in-155.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch223: kvm-block-Introduce-bdrv_reopen_commit_post-step.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch224: kvm-block-qcow2-Move-bitmap-reopen-into-bdrv_reopen_comm.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch225: kvm-iotests-Refactor-blockdev-reopen-test-for-iothreads.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch226: kvm-block-bdrv_reopen-with-backing-file-in-different-Aio.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow 
blockdev-mirror +Patch227: kvm-block-Versioned-x-blockdev-reopen-API-with-feature-f.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch228: kvm-block-Make-bdrv_get_cumulative_perm-public.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch229: kvm-block-Relax-restrictions-for-blockdev-snapshot.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch230: kvm-iotests-Fix-run_job-with-use_log-False.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch231: kvm-iotests-Test-mirror-with-temporarily-disabled-target.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch232: kvm-block-Fix-cross-AioContext-blockdev-snapshot.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch233: kvm-iotests-Add-iothread-cases-to-155.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch234: kvm-qapi-Add-allow-write-only-overlay-feature-for-blockd.patch +# For bz#1809380 - guest hang during reboot process after migration from RHEl7.8 to RHEL8.2.0. 
+Patch235: kvm-exec-rom_reset-Free-rom-data-during-inmigrate-skip.patch +# For bz#1814336 - [POWER9] QEMU migration-test triggers a kernel warning +Patch236: kvm-migration-Rate-limit-inside-host-pages.patch +# For bz#1811670 - Unneeded qemu-guest-agent dependency on pixman +Patch237: kvm-build-sys-do-not-make-qemu-ga-link-with-pixman.patch +# For bz#1816007 - qemu-img convert failed to convert with block device as target +Patch238: kvm-block-pass-BlockDriver-reference-to-the-.bdrv_co_cre.patch +# For bz#1816007 - qemu-img convert failed to convert with block device as target +Patch239: kvm-block-trickle-down-the-fallback-image-creation-funct.patch +# For bz#1794692 - Mirror block job stops making progress +Patch240: kvm-Revert-mirror-Don-t-let-an-operation-wait-for-itself.patch +# For bz#1794692 - Mirror block job stops making progress +Patch241: kvm-mirror-Wait-only-for-in-flight-operations.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch242: kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch243: kvm-replication-assert-we-own-context-before-job_cancel_.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch244: kvm-backup-don-t-acquire-aio_context-in-backup_clean.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch245: kvm-block-backend-Reorder-flush-pdiscard-function-defini.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch246: kvm-block-Increase-BB.in_flight-for-coroutine-and-sync-i.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch247: kvm-block-Fix-blk-in_flight-during-blk_wait_while_draine.patch +# For bz#1822682 - QEMU-4.2 fails to start a VM on Azure +Patch248: kvm-target-i386-do-not-set-unsupported-VMX-secondary-exe.patch +# For bz#1790899 - [RFE] QEMU devices should have the 
option to enable/disable hotplug/unplug +Patch249: kvm-pcie_root_port-Add-hotplug-disabling-option.patch +# For bz#1816793 - 'edid' compat handling missing for virtio-gpu-ccw +Patch250: kvm-compat-disable-edid-for-virtio-gpu-ccw.patch +# For bz#1820531 - qmp command query-pci get wrong result after hotplug device under hotplug=off controller +Patch251: kvm-hw-pci-pcie-Forbid-hot-plug-if-it-s-disabled-on-the-.patch +# For bz#1820531 - qmp command query-pci get wrong result after hotplug device under hotplug=off controller +Patch252: kvm-hw-pci-pcie-Replace-PCI_DEVICE-casts-with-existing-v.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch253: kvm-tools-virtiofsd-passthrough_ll-Fix-double-close.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch254: kvm-virtiofsd-add-rlimit-nofile-NUM-option.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch255: kvm-virtiofsd-stay-below-fs.file-max-sysctl-value-CVE-20.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch256: kvm-virtiofsd-jail-lo-proc_self_fd.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch257: kvm-virtiofsd-Show-submounts.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch258: kvm-virtiofsd-only-retain-file-system-capabilities.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch259: kvm-virtiofsd-drop-all-capabilities-in-the-wait-parent-p.patch +# For 
bz#1775462 - Creating luks-inside-qcow2 images with cluster_size=2k/4k will get a corrupted image +Patch260: kvm-block-always-fill-entire-LUKS-header-space-with-zero.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch261: kvm-numa-remove-not-needed-check.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch262: kvm-numa-properly-check-if-numa-is-supported.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch263: kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch264: kvm-numa-Extend-CLI-to-provide-memory-latency-and-bandwi.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch265: kvm-numa-Extend-CLI-to-provide-memory-side-cache-informa.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch266: kvm-hmat-acpi-Build-Memory-Proximity-Domain-Attributes-S.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch267: kvm-hmat-acpi-Build-System-Locality-Latency-and-Bandwidt.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch268: kvm-hmat-acpi-Build-Memory-Side-Cache-Information-Struct.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch269: kvm-tests-numa-Add-case-for-QMP-build-HMAT.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch270: kvm-tests-bios-tables-test-add-test-cases-for-ACPI-HMAT.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch271: kvm-ACPI-add-expected-files-for-HMAT-tests-acpihmat.patch +# For bz#1813940 - CVE-2020-10702 virt:8.1/qemu-kvm: qemu: weak signature generation in Pointer Authentication support for ARM [rhel-av-8] +Patch272: 
kvm-target-arm-Fix-PAuth-sbox-functions.patch +# For bz#1749737 - CVE-2019-15890 qemu-kvm: QEMU: Slirp: use-after-free during packet reassembly [rhel-av-8] +Patch273: kvm-Don-t-leak-memory-when-reallocation-fails.patch +# For bz#1749737 - CVE-2019-15890 qemu-kvm: QEMU: Slirp: use-after-free during packet reassembly [rhel-av-8] +Patch274: kvm-Replace-remaining-malloc-free-user-with-glib.patch +# For bz#1839030 - RFE: enable the "memfd" memory backend +Patch275: kvm-Revert-RHEL-disable-hostmem-memfd.patch +# For bz#1827630 - volume creation leaving uncleaned stuff behind on error (vol-clone/libvirt/qemu-kvm) +Patch276: kvm-block-introducing-bdrv_co_delete_file-interface.patch +# For bz#1827630 - volume creation leaving uncleaned stuff behind on error (vol-clone/libvirt/qemu-kvm) +Patch277: kvm-block.c-adding-bdrv_co_delete_file.patch +# For bz#1827630 - volume creation leaving uncleaned stuff behind on error (vol-clone/libvirt/qemu-kvm) +Patch278: kvm-crypto.c-cleanup-created-file-when-block_crypto_co_c.patch +# For bz#1513681 - [Intel 8.2.1 Feat] qemu-kvm PT VMX -- Fast Train +Patch279: kvm-target-i386-set-the-CPUID-level-to-0x14-on-old-machi.patch +# For bz#1841038 - qemu-img: /var/tmp/v2vovl56bced.qcow2: CURL: Error opening file: Server does not support 'range' (byte ranges) with HTTP/2 server in VMware ESXi 7 +Patch280: kvm-block-curl-HTTP-header-fields-allow-whitespace-aroun.patch +# For bz#1841038 - qemu-img: /var/tmp/v2vovl56bced.qcow2: CURL: Error opening file: Server does not support 'range' (byte ranges) with HTTP/2 server in VMware ESXi 7 +Patch281: kvm-block-curl-HTTP-header-field-names-are-case-insensit.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch282: kvm-MAINTAINERS-fix-qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for 
qcow2 +Patch283: kvm-iotests-Let-_make_test_img-parse-its-parameters.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch284: kvm-qemu_img-add-cvtnum_full-to-print-error-reports.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch285: kvm-block-Make-it-easier-to-learn-which-BDS-support-bitm.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch286: kvm-blockdev-Promote-several-bitmap-functions-to-non-sta.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch287: kvm-blockdev-Split-off-basic-bitmap-operations-for-qemu-.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch288: kvm-qemu-img-Add-bitmap-sub-command.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch289: kvm-iotests-Fix-test-178.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch290: kvm-qcow2-Expose-bitmaps-size-during-measure.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch291: kvm-qemu-img-Factor-out-code-for-merging-bitmaps.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch292: kvm-qemu-img-Add-convert-bitmaps-option.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability 
to estimate bitmap space utilization for qcow2 +Patch293: kvm-iotests-Add-test-291-to-for-qemu-img-bitmap-coverage.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch294: kvm-iotests-Add-more-skip_if_unsupported-statements-to-t.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch295: kvm-iotests-don-t-use-format-for-drive_add.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch296: kvm-iotests-055-refactor-compressed-backup-to-vmdk.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch297: kvm-iotests-055-skip-vmdk-target-tests-if-vmdk-is-not-wh.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch298: kvm-backup-Improve-error-for-bdrv_getlength-failure.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch299: kvm-backup-Make-sure-that-source-and-target-size-match.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch300: kvm-iotests-Backup-with-different-source-target-size.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch301: kvm-iotests-109-Don-t-mirror-with-mismatched-size.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch302: kvm-iotests-229-Use-blkdebug-to-inject-an-error.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch303: kvm-mirror-Make-sure-that-source-and-target-size-match.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch304: kvm-iotests-Mirror-with-different-source-target-size.patch +# For bz#1841068 - RFE: please support the "ramfb" display device model +Patch305: kvm-enable-ramfb.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch306: kvm-block-Add-flags-to-BlockDriver.bdrv_co_truncate.patch +# 
For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch307: kvm-block-Add-flags-to-bdrv-_co-_truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch308: kvm-block-backend-Add-flags-to-blk_truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch309: kvm-qcow2-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch310: kvm-raw-format-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch311: kvm-file-posix-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch312: kvm-block-truncate-Don-t-make-backing-file-data-visible.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch313: kvm-iotests-Add-qemu_io_log.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch314: kvm-iotests-Filter-testfiles-out-in-filter_img_info.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch315: kvm-iotests-Test-committing-to-short-backing-file.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch316: kvm-qcow2-Forward-ZERO_WRITE-flag-for-full-preallocation.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch317: kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch318: kvm-i386-Add-macro-for-stibp.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch319: 
kvm-target-i386-Add-new-bit-definitions-of-MSR_IA32_ARCH.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch320: kvm-i386-Add-new-CPU-model-Cooperlake.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch321: kvm-target-i386-Add-missed-features-to-Cooperlake-CPU-mo.patch +# For bz#1845384 - CVE-2020-10761 virt:8.2/qemu-kvm: QEMU: nbd: reachable assertion failure in nbd_negotiate_send_rep_verr via remote client [rhel-av-8] +Patch322: kvm-nbd-server-Avoid-long-error-message-assertions-CVE-2.patch +# For bz#1845384 - CVE-2020-10761 virt:8.2/qemu-kvm: QEMU: nbd: reachable assertion failure in nbd_negotiate_send_rep_verr via remote client [rhel-av-8] +Patch323: kvm-block-Call-attention-to-truncation-of-long-NBD-expor.patch +# For bz#1820531 - qmp command query-pci get wrong result after hotplug device under hotplug=off controller +Patch324: kvm-hw-pci-pcie-Move-hot-plug-capability-check-to-pre_pl.patch +# For bz#1840342 - [Intel 8.2.1 Bug] qemu-kvm Add ARCH_CAPABILITIES to Icelake-Server cpu model - Fast Train +Patch325: kvm-target-i386-Add-ARCH_CAPABILITIES-related-bits-into-.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch327: kvm-linux-headers-update-kvm.h.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch328: kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch329: kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch330: kvm-s390x-Move-initial-reset.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch331: kvm-s390x-Move-clear-reset.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - 
qemu part +Patch332: kvm-s390x-Beautify-diag308-handling.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch333: kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch334: kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch335: kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch336: kvm-pc-bios-s390x-Fix-reset-psw-mask.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch337: kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch338: kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch339: kvm-s390x-Add-missing-vcpu-reset-functions.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch340: kvm-s390-sclp-improve-special-wait-psw-logic.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch341: kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch342: kvm-s390-ipl-sync-back-loadparm.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch343: kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch344: kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch345: 
kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch346: kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch347: kvm-s390x-protvirt-Support-unpack-facility.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch348: kvm-s390x-protvirt-Add-migration-blocker.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch349: kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch350: kvm-s390x-protvirt-KVM-intercept-changes.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch351: kvm-s390x-Add-SIDA-memory-ops.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch352: kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch353: kvm-s390x-protvirt-SCLP-interpretation.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch354: kvm-s390x-protvirt-Set-guest-IPL-PSW.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch355: kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch356: kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch357: kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch358: 
kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch359: kvm-s390x-Add-unpack-facility-feature-to-GA1.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch360: kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch361: kvm-s390x-pv-Retry-ioctls-on-EINTR.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch362: kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch363: kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch +# For bz#1756946 - [zKVM] Re-enable KVM_CAP_S390_AIS for new machine types +Patch364: kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch +# For bz#1756946 - [zKVM] Re-enable KVM_CAP_S390_AIS for new machine types +Patch365: kvm-introduce-kvm_kernel_irqchip_-functions.patch +# For bz#1756946 - [zKVM] Re-enable KVM_CAP_S390_AIS for new machine types +Patch366: kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch +# For bz#1823275 - RHEL8.1 - GPU Numa nodes not visible in guest post the pass-through. 
+Patch367: kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch368: kvm-vfio-ccw-Fix-error-message.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch369: kvm-vfio-ccw-allow-non-prefetch-ORBs.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch370: kvm-linux-headers-support-vfio-ccw-features.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch371: kvm-vfio-ccw-Refactor-cleanup-of-regions.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch372: kvm-vfio-ccw-Add-support-for-the-schib-region.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch373: kvm-vfio-ccw-Refactor-ccw-irq-handler.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch374: kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch375: kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch376: kvm-config-enable-VFIO_CCW.patch +Patch377: kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch +Patch378: kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch +# For bz#1838070 - CVE-2020-1983 virt:rhel/qemu-kvm: QEMU: slirp: use-after-free in ip_reass() function in ip_input.c [rhel-8] +Patch379: kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch +# For bz#1835390 - qemu promote host does not support 'EDX.npt' and 'EDX.nrip-save' when test with Q35 machine type on EPYC host +Patch380: kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch +# For bz#1854092 - kvm-unit-tests: tcg smp FAIL +Patch381: kvm-s390x-sigp-Fix-sense-running-reporting.patch +# For 
bz#1854092 - kvm-unit-tests: tcg smp FAIL +Patch382: kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch +Patch383: kvm-virtio-net-fix-removal-of-failover-device.patch +# For bz#1807057 - qcow2_alloc_cluster_abort() frees preallocated zero clusters +Patch384: kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch +# For bz#1807057 - qcow2_alloc_cluster_abort() frees preallocated zero clusters +Patch385: kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch +# For bz#1807057 - qcow2_alloc_cluster_abort() frees preallocated zero clusters +Patch386: kvm-iotests-026-Test-EIO-on-allocation-in-a-data-file.patch +# For bz#1807057 - qcow2_alloc_cluster_abort() frees preallocated zero clusters +Patch387: kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch +# For bz#1780385 - [RFE] AMD EPYC-Rome support for KVM / QEMU guest +Patch388: kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch +# For bz#1689341 - QEMU should report an error and return failure if AMD SEV is not enabled in the kernel +Patch389: kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch +# For bz#1689341 - QEMU should report an error and return failure if AMD SEV is not enabled in the kernel +Patch390: kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch +# For bz#1863034 - RHEL8.3 Beta - Secure Execution: Unable to start Qemu with "-no-reboot" option (qemu-kvm) +Patch391: kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch +# For bz#1869710 - CVE-2020-14364 qemu-kvm: QEMU: usb: out-of-bounds r/w access issue while processing usb packets [rhel-8.3.0] +Patch392: kvm-usb-fix-setup_len-init-CVE-2020-14364.patch +# For bz#1755075 - [qemu-guest-agent] fsinfo doesn't return disk info on s390x +Patch393: kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch +# For bz#1755075 - [qemu-guest-agent] fsinfo doesn't return disk info on s390x +Patch394: kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch +# For bz#1755075 - [qemu-guest-agent] 
fsinfo doesn't return disk info on s390x +Patch395: kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch +# For bz#1874780 - -prom-env does not validate input +Patch396: kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch397: kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch398: kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch399: kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch400: kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch401: kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch402: kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch403: kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch404: kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch405: kvm-aio-posix-completely-stop-polling-when-disabled.patch +# For bz#1884531 - 
qemu-ga aborts after guest-shutdown command +Patch406: kvm-qga-fix-assert-regression-on-guest-shutdown.patch +# For bz#1857733 - [IBM 8.4 FEAT] KVM: Add support for virtio-fs on s390x - qemu part +Patch407: kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch +# For bz#1857733 - [IBM 8.4 FEAT] KVM: Add support for virtio-fs on s390x - qemu part +Patch408: kvm-virtio-add-vhost-user-fs-ccw-device.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch410: kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch411: kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch412: kvm-s390-sclp-rework-sclp-boundary-checks.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch413: kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch414: kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch415: kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch416: kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch417: kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch418: kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch419: kvm-s390-guest-support-for-diagnose-0x318.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs 
- qemu part +Patch420: kvm-s390x-pv-Remove-sclp-boundary-checks.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch421: kvm-s390x-pv-Fix-diag318-PV-fencing.patch +# For bz#1659412 - [IBM 8.4 FEAT] KVM enablement for enhanced hardware diagnose data of guest kernel on s390x - qemu part +Patch422: kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch +# For bz#1898700 - qemu-kvm for RHEL-8.4 doesn't build due to a possible incompatibility with systemtap-sdt-devel-4.4-1 +Patch423: kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch +# For bz#1860994 - CVE-2020-16092 virt:rhel/qemu-kvm: QEMU: reachable assertion failure in net_tx_pkt_add_raw_fragment() in hw/net/net_tx_pkt.c [rhel-8] +Patch424: kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch +# For bz#1880546 - qemu use SCMP_ACT_TRAP even SCMP_ACT_KILL_PROCESS is available +Patch425: kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch +# For bz#1903135 - RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm) +Patch426: kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch +# For bz#1903135 - RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm) +Patch427: kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch +# For bz#1903135 - RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm) +Patch428: kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch +# For bz#1903135 - RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm) +Patch429: kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch430: kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch431: kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch +# For bz#1902237 - 
CVE-2020-29129 CVE-2020-29130 virt:rhel/qemu-kvm: QEMU: slirp: out-of-bounds access while processing ARP/NCSI packets [rhel-8] +Patch432: kvm-slirp-check-pkt_len-before-reading-protocol-header.patch +# For bz#1905386 - RHEL8.3 - s390x/s390-virtio-ccw: Reset PCI devices during subsystem reset (qemu-kvm) +Patch433: kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch434: kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch435: kvm-error-Fix-examples-in-error.h-s-big-comment.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch436: kvm-error-Improve-error.h-s-big-comment.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch437: kvm-error-Document-Error-API-usage-rules.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch438: kvm-error-New-macro-ERRP_GUARD.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch439: kvm-qga-add-command-guest-get-disks.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch440: kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch441: kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch442: kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch443: kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch +# For bz#1910267 - There is no soft link '/etc/qemu-kvm/fsfreeze-hook' +Patch444: kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch +# For bz#1910326 - Incorrect hostname returned by qga 
command 'guest-get-host-name' +Patch445: kvm-qga-rename-Error-parameter-to-more-common-errp.patch +# For bz#1910326 - Incorrect hostname returned by qga command 'guest-get-host-name' +Patch446: kvm-util-Introduce-qemu_get_host_name.patch +# For bz#1910326 - Incorrect hostname returned by qga command 'guest-get-host-name' +Patch447: kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch449: kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch450: kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch451: kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch452: kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch453: kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. 
+Patch454: kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch455: kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch456: kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch457: kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch458: kvm-memory-Add-IOMMUTLBEvent.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch459: kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch460: kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. 
+Patch461: kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch +# For bz#1904393 - CVE-2020-27821 virt:rhel/qemu-kvm: QEMU: heap buffer overflow in msix_table_mmio_write() in hw/pci/msix.c [rhel-8] +Patch462: kvm-memory-clamp-cached-translation-in-case-it-points-to.patch +# For bz#1898628 - CVE-2020-25723 virt:rhel/qemu-kvm: QEMU: assertion failure through usb_packet_unmap() in hw/usb/hcd-ehci.c [rhel-8] +Patch463: kvm-hw-ehci-check-return-value-of-usb_packet_map.patch +# For bz#1903070 - CVE-2020-25707 CVE-2020-28916 virt:rhel/qemu-kvm: various flaws [rhel-8] +Patch464: kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch465: kvm-linux-headers-add-vfio-DMA-available-capability.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch466: kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch467: kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch468: kvm-vfio-Find-DMA-available-capability.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch469: kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch470: kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch471: kvm-s390x-fix-build-for-without-default-devices.patch +# For bz#1918054 - CVE-2020-10756 virt:rhel/qemu-kvm: QEMU: slirp: networking out-of-bounds read information disclosure vulnerability [rhel-8.4.0] +Patch472: kvm-Drop-bogus-IPv6-messages.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch473: kvm-spapr-Improve-handling-of-fdt-buffer-size.patch +# For bz#1901837 - Failed to 
hotunplug pc-dimm device +Patch474: kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch475: kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch476: kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch477: kvm-spapr-Allow-memory-unplug-to-always-succeed.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch478: kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch +# For bz#1834281 - qemu-img convert abort when converting image with unaligned size +Patch479: kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch +# For bz#1834281 - qemu-img convert abort when converting image with unaligned size +Patch480: kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch +# For bz#1912974 - CVE-2020-11947 virt:rhel/qemu-kvm: QEMU: heap buffer overflow in iscsi_aio_ioctl_cb() in block/iscsi.c may lead to information disclosure [rhel-8] +Patch481: kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch +# For bz#1919111 - CVE-2020-35517 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privileged host device access from guest [rhel-8.4.0] +Patch482: kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch +# For bz#1919111 - CVE-2020-35517 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privileged host device access from guest [rhel-8.4.0] +Patch483: kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch +# For bz#1919111 - CVE-2020-35517 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privileged host device access from guest [rhel-8.4.0] +Patch484: kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch +# For bz#1912891 - [ppc64le] --disk cdimage.iso,bus=usb fails to boot +Patch486: kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch +# For bz#1790620 - [RFE] AMD Milan - Add KVM/support for EPYC-Milan CPU Model - Slow Train 
+Patch487: kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch +# For bz#1790620 - [RFE] AMD Milan - Add KVM/support for EPYC-Milan CPU Model - Slow Train +Patch488: kvm-target-i386-add-fast-short-REP-MOV-support.patch +# For bz#1790620 - [RFE] AMD Milan - Add KVM/support for EPYC-Milan CPU Model - Slow Train +Patch489: kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch +# For bz#1790620 - [RFE] AMD Milan - Add KVM/support for EPYC-Milan CPU Model - Slow Train +Patch490: kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch +# For bz#1917451 - CVE-2020-29443 virt:rhel/qemu-kvm: QEMU: ide: atapi: OOB access while processing read commands [rhel-8.4.0] +Patch491: kvm-ide-atapi-check-logical-block-address-and-read-size-.patch +# For bz#1892350 - CVE-2020-27617 virt:rhel/qemu-kvm: QEMU: net: an assert failure via eth_get_gso_type [rhel-8.5.0] +Patch492: kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch +# For bz#1930092 - CVE-2021-20257 virt:rhel/qemu-kvm: QEMU: net: e1000: infinite loop while processing transmit descriptors [rhel-8.5.0] +Patch493: kvm-e1000-fail-early-for-evil-descriptor.patch +# For bz#1859175 - CVE-2020-15859 virt:rhel/qemu-kvm: QEMU: net: e1000e: use-after-free while sending packets [rhel-8] +Patch494: kvm-net-forbid-the-reentrant-RX.patch +# For bz#1855250 - qemu-img convert uses possibly slow pre-zeroing on block storage +Patch495: kvm-qemu-img-convert-Don-t-pre-zero-images.patch +# For bz#1932823 - after upgrade from 4.3 to 4.4 audio stops working in guests after couple of seconds +Patch496: kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch +# For bz#1925430 - CVE-2021-20221 virt:rhel/qemu-kvm: qemu: out-of-bound heap buffer access via an interrupt ID field [rhel-8.5.0] +Patch497: kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch498: 
kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch499: kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch500: kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch501: kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch502: kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch503: kvm-softmmu-memory-Log-invalid-memory-accesses.patch +# For bz#1940450 - RHEL8.5 - Mediated Device already in use by same domain we are booting (vfio-ccw/Multipath Testing) (kvm) - qemu-kvm part (also has kernel and libvirt parts) +Patch504: kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch +# For bz#1940450 - RHEL8.5 - Mediated Device already in use by same domain we are booting (vfio-ccw/Multipath Testing) (kvm) - qemu-kvm part (also has kernel and libvirt parts) +Patch505: kvm-vfio-ccw-Connect-the-device-request-notifier.patch +# For bz#1942880 - RHEL8.4 Nightly[0322] - KVM guest fails to find zipl boot menu index (qemu-kvm) +Patch506: kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch +# For bz#1942880 - RHEL8.4 Nightly[0322] - KVM guest fails to find zipl boot menu index (qemu-kvm) +Patch507: kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch +# For bz#1942880 - RHEL8.4 Nightly[0322] - KVM guest fails to find zipl boot menu index (qemu-kvm) +Patch508: 
kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch +# For bz#1877163 - [FJ 8.3 Bug] The progress bar of the "virt-clone --nonsparse" command shows the progress rate exceeding 100%. +Patch509: kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch +# For bz#1944861 - Qemu-img convert fails when source image is on gpfs +Patch510: kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch +# For bz#1969768 - [ppc64le] Hotplug vcpu device hit call trace:[qemu output] KVM: unknown exit, hardware reason 7fff9ce87ed8 +Patch511: kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch +# For bz#1969768 - [ppc64le] Hotplug vcpu device hit call trace:[qemu output] KVM: unknown exit, hardware reason 7fff9ce87ed8 +Patch512: kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch +# For bz#1967914 - [virtio-fs] virtiofsd quit when coping file to a folder in virtio-fs mounted volume(windows guest) +Patch513: kvm-virtiofsd-Whitelist-fchmod.patch +# For bz#1957866 - RHEL8.4 - EEH capability disabled on KVM guest and recovery of PCI passthru device fails(CX5 / mlx5_core) (qemu-kvm) +Patch514: kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch +# For bz#1970912 - Deployment fails with "Invalid or missing agent token received" +Patch515: kvm-Compress-lines-for-immediate-return.patch +# For bz#1970912 - Deployment fails with "Invalid or missing agent token received" +Patch516: kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch517: kvm-net-introduce-qemu_receive_packet.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch518: kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch519: 
kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch520: kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch521: kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch522: kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch523: kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch524: kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch525: kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch +# For bz#1967716 - RFE: rebuild guest agent to include public ssh injection api support +Patch526: kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch +# For bz#1967716 - RFE: rebuild guest agent to include public ssh injection api support +Patch527: kvm-qga-add-ssh-add-remove-authorized-keys.patch +# For bz#1967716 - RFE: rebuild guest agent to include public ssh injection api support +Patch528: kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch +# For bz#1967716 - RFE: rebuild guest agent to include public ssh injection api support +Patch529: kvm-qga-add-ssh-get-authorized-keys.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) 
[rhel-8] +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch530: kvm-Add-mtod_check.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8] +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch531: kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8] +Patch532: kvm-bootp-check-bootp_input-buffer-size.patch +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +Patch533: kvm-upd6-check-udp6_input-buffer-size.patch +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +Patch534: kvm-tftp-check-tftp_input-buffer-size.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8] +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: 
invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch535: kvm-tftp-introduce-a-header-structure.patch +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch536: kvm-udp-check-upd_input-buffer-size.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8] +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch537: kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch +# For bz#1982134 - QEMU core dump while booting guest with a non-exist fd on tap +Patch538: kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch +# For bz#1982134 - QEMU core dump while booting guest with a non-exist fd on tap +Patch539: kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch +# For bz#1969848 - qemu-img convert hangs on aarch64 +Patch540: kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch +# For bz#1969848 - qemu-img convert hangs on aarch64 +Patch541: kvm-async-use-explicit-memory-barriers.patch +# For bz#1967496 - [virtio-fs] nfs/xfstest generic/089 generic/478 generic/632 failed +Patch542: kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch +# For 
bz#1967496 - [virtio-fs] nfs/xfstest generic/089 generic/478 generic/632 failed +Patch543: kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch +# For bz#1994041 - qemu-kvm scsi: change default passthrough timeout to non-infinite +Patch544: kvm-scsi-make-io_timeout-configurable.patch +# For bz#2025605 - CVE-2021-3930 virt:rhel/qemu-kvm: QEMU: off-by-one error in mode_sense_page() in hw/scsi/scsi-disk.c [rhel-8.5.0.z] +Patch545: kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch +# For bz#2025011 - CVE-2021-20257 virt:rhel/qemu-kvm: QEMU: net: e1000: infinite loop while processing transmit descriptors [rhel-8.5.0.z] +Patch546: kvm-e1000-fix-tx-re-entrancy-problem.patch +# For bz#2048627 - CVE-2022-0358 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privilege escalation via CVE-2018-13405 [rhel-8.5.0.z] +Patch547: kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch + +BuildRequires: wget +BuildRequires: rpm-build +BuildRequires: zlib-devel +BuildRequires: glib2-devel +BuildRequires: which +BuildRequires: gnutls-devel +BuildRequires: cyrus-sasl-devel +BuildRequires: libtool +BuildRequires: libaio-devel +BuildRequires: rsync +BuildRequires: python3-devel +BuildRequires: pciutils-devel +BuildRequires: libiscsi-devel +BuildRequires: ncurses-devel +BuildRequires: libattr-devel +BuildRequires: libusbx-devel >= 1.0.22 +%if %{have_usbredir} +BuildRequires: usbredir-devel >= 0.7.1 +%endif +BuildRequires: texinfo +BuildRequires: python3-sphinx +%if %{have_spice} +BuildRequires: spice-protocol >= 0.12.12 +BuildRequires: spice-server-devel >= 0.12.8 +BuildRequires: libcacard-devel +# For smartcard NSS support +BuildRequires: nss-devel +%endif +BuildRequires: libseccomp-devel >= 2.4.0 +# For network block driver +BuildRequires: libcurl-devel +BuildRequires: libssh-devel +BuildRequires: librados-devel +BuildRequires: librbd-devel +%if %{have_gluster} +# For gluster block driver +BuildRequires: glusterfs-api-devel +BuildRequires: glusterfs-devel +%endif +# 
We need both because the 'stap' binary is probed for by configure +BuildRequires: systemtap +BuildRequires: systemtap-sdt-devel +# For VNC PNG support +BuildRequires: libpng-devel +# For uuid generation +BuildRequires: libuuid-devel +# For BlueZ device support +BuildRequires: bluez-libs-devel +# For Braille device support +BuildRequires: brlapi-devel +# For test suite +BuildRequires: check-devel +# For virtiofs +BuildRequires: libcap-ng-devel +# Hard requirement for version >= 1.3 +BuildRequires: pixman-devel +# Documentation requirement +BuildRequires: perl-podlators +BuildRequires: texinfo +BuildRequires: python3-sphinx +# For rdma +%if 0%{?have_librdma} +BuildRequires: rdma-core-devel +%endif +%if %{have_fdt} +BuildRequires: libfdt-devel >= 1.6.0 +%endif +# iasl and cpp for acpi generation (not a hard requirement as we can use +# pre-compiled files, but it's better to use this) +%ifarch %{ix86} x86_64 +BuildRequires: iasl +BuildRequires: cpp +%endif +# For compressed guest memory dumps +BuildRequires: lzo-devel snappy-devel +# For NUMA memory binding +%ifnarch s390x +BuildRequires: numactl-devel +%endif +BuildRequires: libgcrypt-devel +# qemu-pr-helper multipath support (requires libudev too) +BuildRequires: device-mapper-multipath-devel +BuildRequires: systemd-devel +# used by qemu-bridge-helper and qemu-pr-helper +BuildRequires: libcap-ng-devel + +BuildRequires: diffutils +%ifarch x86_64 +BuildRequires: libpmem-devel +Requires: libpmem +%endif + +# qemu-keymap +BuildRequires: pkgconfig(xkbcommon) + +# For s390-pgste flag +%ifarch s390x +BuildRequires: binutils >= 2.27-16 +%endif + +%if %{have_opengl} +BuildRequires: pkgconfig(epoxy) +BuildRequires: pkgconfig(libdrm) +BuildRequires: pkgconfig(gbm) +Requires: mesa-libGL +Requires: mesa-libEGL +Requires: mesa-dri-drivers +%endif + +BuildRequires: perl-Test-Harness + +Requires: qemu-kvm-core = %{epoch}:%{version}-%{release} +%rhev_ma_conflicts qemu-kvm + +%{requires_all_modules} + +%define qemudocdir 
%{_docdir}/%{name} + +%description +qemu-kvm is an open source virtualizer that provides hardware +emulation for the KVM hypervisor. qemu-kvm acts as a virtual +machine monitor together with the KVM kernel modules, and emulates the +hardware for a full system such as a PC and its associated peripherals. + + +%package -n qemu-kvm-core +Summary: qemu-kvm core components +Requires: qemu-img = %{epoch}:%{version}-%{release} +%ifarch %{ix86} x86_64 +Requires: seabios-bin >= 1.10.2-1 +Requires: sgabios-bin +Requires: edk2-ovmf +%endif +%ifarch aarch64 +Requires: edk2-aarch64 +%endif + +%ifnarch aarch64 s390x +Requires: seavgabios-bin >= 1.12.0-3 +Requires: ipxe-roms-qemu >= 20170123-1 +%endif +%ifarch %{power64} +Requires: SLOF >= %{SLOF_gittagdate}-1.git%{SLOF_gittagcommit} +%endif +Requires: %{name}-common = %{epoch}:%{version}-%{release} +Requires: libseccomp >= 2.4.0 +# For compressed guest memory dumps +Requires: lzo snappy +%if %{have_kvm_setup} +Requires(post): systemd-units +Requires(preun): systemd-units + %ifarch %{power64} +Requires: powerpc-utils + %endif +%endif +# Keep in step with BuildRequires: libusbx-devel >= 1.0.22 +Requires: libusbx >= 1.0.22 +%if %{have_usbredir} +Requires: usbredir >= 0.7.1 +%endif +%if %{have_fdt} +Requires: libfdt >= 1.6.0 +%endif + +%rhev_ma_conflicts qemu-kvm + +%description -n qemu-kvm-core +qemu-kvm is an open source virtualizer that provides hardware +emulation for the KVM hypervisor. qemu-kvm acts as a virtual +machine monitor together with the KVM kernel modules, and emulates the +hardware for a full system such as a PC and its associated peripherals. + + +%package -n qemu-img +Summary: QEMU command line tool for manipulating disk images +Group: Development/Tools + +%rhev_ma_conflicts qemu-img + +%description -n qemu-img +This package provides a command line tool for manipulating disk images.
+ +%package -n qemu-kvm-common +Summary: QEMU common files needed by all QEMU targets +Group: Development/Tools +Requires(post): /usr/bin/getent +Requires(post): /usr/sbin/groupadd +Requires(post): /usr/sbin/useradd +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units + +%rhev_ma_conflicts qemu-kvm-common + +%description -n qemu-kvm-common +qemu-kvm is an open source virtualizer that provides hardware emulation for +the KVM hypervisor. + +This package provides documentation and auxiliary programs used with qemu-kvm. + + +%package -n qemu-guest-agent +Summary: QEMU guest agent +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units + +%description -n qemu-guest-agent +qemu-kvm is an open source virtualizer that provides hardware emulation for +the KVM hypervisor. + +This package provides an agent to run inside guests, which communicates +with the host over a virtio-serial channel named "org.qemu.guest_agent.0". + +This package does not need to be installed on the host OS. + +%package tests +Summary: tests for the qemu-kvm package +Requires: %{name} = %{epoch}:%{version}-%{release} + +%define testsdir %{_libdir}/%{name}/tests-src + +%description tests +The qemu-kvm-tests rpm contains tests that can be used to verify +the functionality of the installed qemu-kvm package. + +Install this package if you want access to the avocado_qemu +tests, or qemu-iotests. + +%package block-curl +Summary: QEMU CURL block driver +Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release} + +%description block-curl +This package provides the additional CURL block driver for QEMU. + +Install this package if you want to access remote disks over +http, https, ftp and other transports provided by the CURL library.
+
+
+%if 0%{have_gluster}
+%package block-gluster
+# This subpackage is only declared when Gluster support is switched on.
+Summary: QEMU Gluster block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+%description block-gluster
+This package provides the additional Gluster block driver for QEMU.
+
+Install this package if you want to access remote Gluster storage.
+%endif
+
+
+%package block-iscsi
+Summary: QEMU iSCSI block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-iscsi
+This package provides the additional iSCSI block driver for QEMU.
+
+Install this package if you want to access iSCSI volumes.
+
+
+%package block-rbd
+Summary: QEMU Ceph/RBD block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-rbd
+This package provides the additional Ceph/RBD block driver for QEMU.
+
+Install this package if you want to access remote Ceph volumes
+using the rbd protocol.
+
+
+%package block-ssh
+Summary: QEMU SSH block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-ssh
+This package provides the additional SSH block driver for QEMU.
+
+Install this package if you want to access remote disks using
+the Secure Shell (SSH) protocol.
+
+
+%prep
+# Unpack the qemu source tree and apply all patches with strip level 1.
+%setup -n qemu-%{version}
+%autopatch -p1
+
+%build
+%global buildarch %{kvm_target}-softmmu
+
+# --build-id option is used for giving info to the debug packages.
+buildldflags="VL_LDFLAGS=-Wl,--build-id"
+
+# Block drivers handed to configure as the read-write whitelist.
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+
+%if 0%{have_gluster}
+  %global block_drivers_list %{block_drivers_list},gluster
+%endif
+
+# Feature switches are spelled out explicitly; the conditionals below pick
+# the ones that depend on the target architecture or on build-time toggles.
+./configure \
+    --prefix="%{_prefix}" \
+    --libdir="%{_libdir}" \
+    --sysconfdir="%{_sysconfdir}" \
+    --interp-prefix=%{_prefix}/qemu-%M \
+    --localstatedir="%{_localstatedir}" \
+    --docdir="%{qemudocdir}" \
+    --libexecdir="%{_libexecdir}" \
+    --extra-ldflags="-Wl,--build-id -Wl,-z,relro -Wl,-z,now" \
+    --extra-cflags="%{optflags}" \
+    --with-pkgversion="%{name}-%{version}-%{release}" \
+    --with-confsuffix=/"%{name}" \
+    --firmwarepath=%{_prefix}/share/qemu-firmware \
+%if 0%{have_fdt}
+    --enable-fdt \
+%else
+    --disable-fdt \
+%endif
+%if 0%{have_gluster}
+    --enable-glusterfs \
+%else
+    --disable-glusterfs \
+%endif
+    --enable-guest-agent \
+%ifnarch s390x
+    --enable-numa \
+%else
+    --disable-numa \
+%endif
+    --enable-rbd \
+%if 0%{have_librdma}
+    --enable-rdma \
+%else
+    --disable-rdma \
+%endif
+    --disable-pvrdma \
+    --enable-seccomp \
+%if 0%{have_spice}
+    --enable-spice \
+    --enable-smartcard \
+%else
+    --disable-spice \
+    --disable-smartcard \
+%endif
+%if 0%{have_opengl}
+    --enable-opengl \
+%else
+    --disable-opengl \
+%endif
+%if 0%{have_usbredir}
+    --enable-usb-redir \
+%else
+    --disable-usb-redir \
+%endif
+    --disable-tcmalloc \
+%ifarch x86_64
+    --enable-libpmem \
+%else
+    --disable-libpmem \
+%endif
+    --enable-vhost-user \
+%ifarch s390x
+    --enable-vhost-user-fs \
+%endif
+%ifarch %{ix86} x86_64
+    --enable-avx2 \
+%else
+    --disable-avx2 \
+%endif
+    --python=%{__python3} \
+    --target-list="%{buildarch}" \
+    --block-drv-rw-whitelist=%{block_drivers_list} \
+    --audio-drv-list= \
+    --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
+    --with-coroutine=ucontext \
+    --tls-priority=@QEMU,SYSTEM \
+    --disable-bluez \
+    --disable-brlapi \
+    --enable-cap-ng \
+    --enable-coroutine-pool \
+    --enable-curl \
+    --disable-curses \
+    --disable-debug-tcg \
+    --enable-docs \
+    --disable-gtk \
+    --enable-kvm \
+    --enable-libiscsi \
+    --disable-libnfs \
+    --enable-libssh \
+    --enable-libusb \
+    --disable-bzip2 \
+    --enable-linux-aio \
+    --disable-live-block-migration \
+    --enable-lzo \
+    --enable-pie \
+    --disable-qom-cast-debug \
+    --disable-sdl \
+    --enable-snappy \
+    --disable-sparse \
+    --disable-strip \
+    --enable-tpm \
+    --enable-trace-backend=dtrace \
+    --disable-vde \
+    --disable-vhost-scsi \
+    --disable-vxhs \
+    --disable-virtfs \
+    --disable-vnc-jpeg \
+    --disable-vte \
+    --enable-vnc-png \
+    --enable-vnc-sasl \
+    --enable-werror \
+    --disable-xen \
+    --disable-xfsctl \
+    --enable-gnutls \
+    --enable-gcrypt \
+    --disable-nettle \
+    --enable-attr \
+    --disable-bsd-user \
+    --disable-cocoa \
+    --enable-debug-info \
+    --disable-guest-agent-msi \
+    --disable-hax \
+    --disable-jemalloc \
+    --disable-linux-user \
+    --enable-modules \
+    --disable-netmap \
+    --disable-replication \
+    --enable-system \
+    --enable-tools \
+    --disable-user \
+    --enable-vhost-net \
+    --enable-vhost-vsock \
+    --enable-vnc \
+    --enable-mpath \
+    --disable-xen-pci-passthrough \
+    --enable-tcg \
+    --with-git=git \
+    --disable-sanitizers \
+    --disable-hvf \
+    --disable-whpx \
+    --enable-malloc-trim \
+    --disable-membarrier \
+    --disable-vhost-crypto \
+    --disable-libxml2 \
+    --enable-capstone \
+    --disable-git-update \
+    --disable-crypto-afalg \
+    --disable-debug-mutex \
+    --disable-bochs \
+    --disable-cloop \
+    --disable-dmg \
+    --disable-qcow1 \
+    --disable-vdi \
+    --disable-vvfat \
+    --disable-qed \
+    --disable-parallels \
+    --disable-sheepdog \
+    --disable-auth-pam \
+    --enable-iconv \
+    --disable-lzfse \
+    --enable-vhost-kernel \
+    --disable-virglrenderer \
+    --without-default-devices
+
+# Dump the generated configuration into the build log.
+echo "config-host.mak contents:"
+echo "==="
+cat config-host.mak
+echo "==="
+
+make V=1 %{?_smp_mflags} $buildldflags
+
+# Generate the SystemTap tapsets (plain, log and simpletrace formats) with
+# probes that reference the installed qemu-kvm binary.
+%{__python3} scripts/tracetool.py --backends=dtrace --format=stap \
+    --group=all --binary %{_libexecdir}/qemu-kvm --probe-prefix qemu.kvm \
+    trace-events-all > qemu-kvm.stp
+
+%{__python3} scripts/tracetool.py --backends=dtrace --format=log-stap \
+    --group=all --binary %{_libexecdir}/qemu-kvm --probe-prefix qemu.kvm \
+    trace-events-all > qemu-kvm-log.stp
+
+%{__python3} scripts/tracetool.py --backends=dtrace --format=simpletrace-stap \
+    --group=all --binary %{_libexecdir}/qemu-kvm --probe-prefix qemu.kvm \
+    trace-events-all > qemu-kvm-simpletrace.stp
+
+# Setup back compat qemu-kvm binary
+cp -a %{kvm_target}-softmmu/qemu-system-%{kvm_target} qemu-kvm
+
+# Build the two helper programs that ship as extra sources.
+gcc %{SOURCE6} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o ksmctl
+gcc %{SOURCE35} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o udev-kvm-check
+
+%ifarch s390x
+  # Copy the built new images into place for "make check":
+  cp pc-bios/s390-ccw/s390-ccw.img pc-bios/s390-ccw/s390-netboot.img pc-bios/
+%endif
+
+%install
+%define _udevdir %(pkg-config --variable=udevdir udev)
+%define _udevrulesdir %{_udevdir}/rules.d
+
+# KSM service unit, its sysconfig file and the ksmctl helper
+install -D -p -m 0644 %{SOURCE4} $RPM_BUILD_ROOT%{_unitdir}/ksm.service
+install -D -p -m 0644 %{SOURCE5} $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ksm
+install -D -p -m 0755 ksmctl $RPM_BUILD_ROOT%{_libexecdir}/ksmctl
+
+# ksmtuned unit, script and configuration, plus the vhost modprobe options
+install -D -p -m 0644 %{SOURCE7} $RPM_BUILD_ROOT%{_unitdir}/ksmtuned.service
+install -D -p -m 0755 %{SOURCE8} $RPM_BUILD_ROOT%{_sbindir}/ksmtuned
+install -D -p -m 0644 %{SOURCE9} $RPM_BUILD_ROOT%{_sysconfdir}/ksmtuned.conf
+install -D -p -m 0644 %{SOURCE26} $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/vhost.conf
+# The kvm modprobe options file is selected per architecture.
+%ifarch s390x
+  install -D -p -m 0644 %{SOURCE30} $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/kvm.conf
+%else
+%ifarch %{ix86} x86_64
+  install -D -p -m 0644 %{SOURCE31} $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/kvm.conf
+%else
+  install -D -p -m 0644 %{SOURCE27} $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/kvm.conf
+%endif
+%endif
+
+mkdir -p $RPM_BUILD_ROOT%{_bindir}/
+mkdir -p $RPM_BUILD_ROOT%{_udevrulesdir}/
+mkdir -p $RPM_BUILD_ROOT%{_datadir}/%{name}
+
+# Create new directories and put them all under tests-src
+mkdir -p $RPM_BUILD_ROOT%{testsdir}/python
+mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests
+mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests/acceptance
+mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests
+mkdir -p $RPM_BUILD_ROOT%{testsdir}/scripts/qmp
+
+install -p -m 0755 udev-kvm-check $RPM_BUILD_ROOT%{_udevdir}
+install -p -m 0644 %{SOURCE34} $RPM_BUILD_ROOT%{_udevrulesdir}
+
+install -m 0644 scripts/dump-guest-memory.py \
+    $RPM_BUILD_ROOT%{_datadir}/%{name}
+
+# Install avocado_qemu tests
+cp -R tests/acceptance/* $RPM_BUILD_ROOT%{testsdir}/tests/acceptance/
+
+# Install qemu.py and qmp/ scripts required to run avocado_qemu tests
+cp -R python/qemu $RPM_BUILD_ROOT%{testsdir}/python
+cp -R scripts/qmp/* $RPM_BUILD_ROOT%{testsdir}/scripts/qmp
+install -p -m 0755 tests/Makefile.include $RPM_BUILD_ROOT%{testsdir}/tests/
+
+# Install qemu-iotests
+cp -R tests/qemu-iotests/* $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests/
+# Avoid ambiguous 'python' interpreter name
+find $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests/* -maxdepth 1 -type f -exec sed -i -e '1 s+/usr/bin/env \(python\|python3\)+%{__python3}+' {} \;
+find $RPM_BUILD_ROOT%{testsdir}/scripts/qmp/* -maxdepth 1 -type f -exec sed -i -e '1 s+/usr/bin/env \(python\|python3\)+%{__python3}+' {} \;
+find $RPM_BUILD_ROOT%{testsdir}/scripts/qmp/* -maxdepth 1 -type f -exec sed -i -e '1 s+/usr/bin/\(python\|python3\)+%{__python3}+' {} \;
+
+install -p -m 0644 %{SOURCE36} $RPM_BUILD_ROOT%{testsdir}/README
+
+make DESTDIR=$RPM_BUILD_ROOT \
+    sharedir="%{_datadir}/%{name}" \
+    datadir="%{_datadir}/%{name}" \
+    install
+
+mkdir -p $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset
+
+# Install qemu-guest-agent service and udev rules
+install -m 0644 %{_sourcedir}/qemu-guest-agent.service %{buildroot}%{_unitdir}
+install -m 0644 %{_sourcedir}/qemu-ga.sysconfig %{buildroot}%{_sysconfdir}/sysconfig/qemu-ga
+install -m 0644 %{_sourcedir}/99-qemu-guest-agent.rules %{buildroot}%{_udevrulesdir}
+
+# - the fsfreeze hook script:
+install -D --preserve-timestamps \
+    scripts/qemu-guest-agent/fsfreeze-hook \
+    $RPM_BUILD_ROOT%{_sysconfdir}/qemu-ga/fsfreeze-hook
+# Workaround for the missing /etc/qemu-kvm/fsfreeze-hook
+# Please, do not carry this over to RHEL-9
+mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/qemu-kvm/
+ln -s %{_sysconfdir}/qemu-ga/fsfreeze-hook \
+    $RPM_BUILD_ROOT%{_sysconfdir}/qemu-kvm/fsfreeze-hook
+
+# - the directory for user scripts:
+mkdir $RPM_BUILD_ROOT%{_sysconfdir}/qemu-ga/fsfreeze-hook.d
+
+# - and the fsfreeze script samples:
+mkdir --parents $RPM_BUILD_ROOT%{_datadir}/%{name}/qemu-ga/fsfreeze-hook.d/
+install --preserve-timestamps --mode=0644 \
+    scripts/qemu-guest-agent/fsfreeze-hook.d/*.sample \
+    $RPM_BUILD_ROOT%{_datadir}/%{name}/qemu-ga/fsfreeze-hook.d/
+
+# - Install dedicated log directory:
+mkdir -p -v $RPM_BUILD_ROOT%{_localstatedir}/log/qemu-ga/
+
+mkdir -p $RPM_BUILD_ROOT%{_bindir}
+install -c -m 0755 qemu-ga ${RPM_BUILD_ROOT}%{_bindir}/qemu-ga
+
+mkdir -p $RPM_BUILD_ROOT%{_mandir}/man8
+
+# Ship the emulator under the qemu-kvm name together with its tapsets.
+install -m 0755 qemu-kvm $RPM_BUILD_ROOT%{_libexecdir}/
+install -m 0644 qemu-kvm.stp $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/
+install -m 0644 qemu-kvm-log.stp $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/
+install -m 0644 qemu-kvm-simpletrace.stp $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/
+
+# Drop what "make install" placed under the qemu-system name, plus the
+# desktop entry and elf2dmp.
+rm $RPM_BUILD_ROOT/%{_datadir}/applications/qemu.desktop
+rm $RPM_BUILD_ROOT%{_bindir}/qemu-system-%{kvm_target}
+rm $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/qemu-system-%{kvm_target}.stp
+rm $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/qemu-system-%{kvm_target}-simpletrace.stp
+rm $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/qemu-system-%{kvm_target}-log.stp
+rm $RPM_BUILD_ROOT%{_bindir}/elf2dmp
+
+# Install simpletrace
+install -m 0755 scripts/simpletrace.py $RPM_BUILD_ROOT%{_datadir}/%{name}/simpletrace.py
+# Avoid ambiguous 'python' interpreter name
+sed -i -e '1 s/python/python3/' $RPM_BUILD_ROOT%{_datadir}/%{name}/simpletrace.py
+mkdir -p $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool
+install -m 0644 -t $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool scripts/tracetool/*.py
+mkdir -p $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool/backend
+install -m 0644 -t $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool/backend scripts/tracetool/backend/*.py
+mkdir -p $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool/format
+install -m 0644 -t $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool/format scripts/tracetool/format/*.py
+
+mkdir -p $RPM_BUILD_ROOT%{qemudocdir}
+install -p -m 0644 -t ${RPM_BUILD_ROOT}%{qemudocdir} Changelog README.rst README.systemtap COPYING COPYING.LIB LICENSE docs/interop/qmp-spec.txt
+chmod -x ${RPM_BUILD_ROOT}%{_mandir}/man1/*
+chmod -x ${RPM_BUILD_ROOT}%{_mandir}/man8/*
+
+install -D -p -m 0644 qemu.sasl $RPM_BUILD_ROOT%{_sysconfdir}/sasl2/%{name}.conf
+
+# Provided by package openbios
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/openbios-ppc
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/openbios-sparc32
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/openbios-sparc64
+# Provided by package SLOF
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/slof.bin
+
+# Remove unpackaged files.
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/palcode-clipper
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/petalogix*.dtb
+rm -f ${RPM_BUILD_ROOT}%{_datadir}/%{name}/bamboo.dtb
+rm -f ${RPM_BUILD_ROOT}%{_datadir}/%{name}/ppc_rom.bin
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/s390-zipl.rom
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/u-boot.e500
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/qemu_vga.ndrv
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/skiboot.lid
+
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/s390-ccw.img
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/s390-netboot.img
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/hppa-firmware.img
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/canyonlands.dtb
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/u-boot-sam460-20100605.bin
+
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/firmware
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/edk2-*.fd
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/edk2-licenses.txt
+
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/opensbi-riscv32-virt-fw_jump.bin
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/opensbi-riscv64-sifive_u-fw_jump.bin
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/opensbi-riscv64-virt-fw_jump.bin
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/qemu-nsis.bmp
+
+rm -rf ${RPM_BUILD_ROOT}%{_libdir}/qemu-kvm/ui-spice-app.so
+
+%ifarch s390x
+  # Use the s390-*.imgs that we've just built, not the pre-built ones
+  install -m 0644 pc-bios/s390-ccw/s390-ccw.img $RPM_BUILD_ROOT%{_datadir}/%{name}/
+  install -m 0644 pc-bios/s390-ccw/s390-netboot.img $RPM_BUILD_ROOT%{_datadir}/%{name}/
+%endif
+
+%ifnarch x86_64
+  rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/kvmvapic.bin
+  rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/linuxboot.bin
+  rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/multiboot.bin
+  rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/pvh.bin
+%endif
+
+# Remove sparc files
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/QEMU,tcx.bin
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/QEMU,cgthree.bin
+
+# Remove ivshmem example programs
+rm -rf ${RPM_BUILD_ROOT}%{_bindir}/ivshmem-client
+rm -rf ${RPM_BUILD_ROOT}%{_bindir}/ivshmem-server
+
+# Remove efi roms
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/efi*.rom
+
+# Provided by package ipxe
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/pxe*rom
+# Provided by package vgabios
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/vgabios*bin
+# Provided by package seabios
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/bios*.bin
+# Provided by package sgabios
+rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/sgabios.bin
+
+# The EFI network boot ROMs are symlinks into the sibling ipxe.efi
+# directory, as QEMU doesn't know how to look for other paths, yet.
+# pxe_link NIC ID: create efi-NIC.rom pointing at ../ipxe.efi/ID.rom
+pxe_link() {
+  ln -s "../ipxe.efi/$2.rom" "%{buildroot}%{_datadir}/%{name}/efi-$1.rom"
+}
+
+%ifnarch aarch64 s390x
+pxe_link e1000 8086100e
+pxe_link ne2k_pci 10ec8029
+pxe_link pcnet 10222000
+pxe_link rtl8139 10ec8139
+pxe_link virtio 1af41000
+pxe_link e1000e 808610d3
+%endif
+
+# rom_link TARGET NAME: create symlink NAME in the qemu data directory
+# pointing at TARGET.
+rom_link() {
+  ln -s "$1" "%{buildroot}%{_datadir}/%{name}/$2"
+}
+
+%ifnarch aarch64 s390x
+  rom_link ../seavgabios/vgabios-isavga.bin vgabios.bin
+  rom_link ../seavgabios/vgabios-cirrus.bin vgabios-cirrus.bin
+  rom_link ../seavgabios/vgabios-qxl.bin vgabios-qxl.bin
+  rom_link ../seavgabios/vgabios-stdvga.bin vgabios-stdvga.bin
+  rom_link ../seavgabios/vgabios-vmware.bin vgabios-vmware.bin
+  rom_link ../seavgabios/vgabios-virtio.bin vgabios-virtio.bin
+  rom_link ../seavgabios/vgabios-ramfb.bin vgabios-ramfb.bin
+  rom_link ../seavgabios/vgabios-bochs-display.bin vgabios-bochs-display.bin
+%endif
+%ifarch x86_64
+  rom_link ../seabios/bios.bin bios.bin
+  rom_link ../seabios/bios-256k.bin bios-256k.bin
+  rom_link ../sgabios/sgabios.bin sgabios.bin
+%endif
+
+%if 0%{have_kvm_setup}
+  install -D -p -m 755 %{SOURCE21} $RPM_BUILD_ROOT%{_prefix}/lib/systemd/kvm-setup
+  install -D -p -m 644 %{SOURCE22} $RPM_BUILD_ROOT%{_unitdir}/kvm-setup.service
+  install -D -p -m 644 %{SOURCE23} $RPM_BUILD_ROOT%{_presetdir}/85-kvm.preset
+%endif
+
+%if 0%{have_memlock_limits}
+  install -D -p -m 644 %{SOURCE28} $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/95-kvm-memlock.conf
+%endif
+
+# Install rules to use the bridge helper with libvirt's virbr0
+install -D -m 0644 %{SOURCE12} $RPM_BUILD_ROOT%{_sysconfdir}/%{name}/bridge.conf
+
+# Install qemu-pr-helper service
+install -m 0644 %{_sourcedir}/qemu-pr-helper.service %{buildroot}%{_unitdir}
+install -m 0644 %{_sourcedir}/qemu-pr-helper.socket %{buildroot}%{_unitdir}
+
+# Drop libtool archives and static libraries; NUL-separated so that
+# unusual file names cannot be split by xargs.
+find $RPM_BUILD_ROOT \( -name '*.la' -or -name '*.a' \) -print0 | xargs -0 rm -f
+
+# We need to make the block device modules executable else
+# RPM won't pick up their dependencies.
+chmod +x $RPM_BUILD_ROOT%{_libdir}/qemu-kvm/block-*.so
+
+# Remove buildinfo
+rm -rf $RPM_BUILD_ROOT%{qemudocdir}/interop/.buildinfo
+
+# Remove spec
+rm -rf $RPM_BUILD_ROOT%{qemudocdir}/specs
+
+%check
+# Run the test suite with DIFF exported as plain diff.
+export DIFF=diff; make check V=1
+
+%post -n qemu-kvm-core
+# load kvm modules now, so we can make sure no reboot is needed.
+# If there's already a kvm module installed, we don't mess with it
+%udev_rules_update
+# Portable redirection: a POSIX shell would parse the bash-only
+# ampersand-greater-than form as "run in background".
+sh %{_sysconfdir}/sysconfig/modules/kvm.modules > /dev/null 2>&1 || :
+udevadm trigger --subsystem-match=misc --sysname-match=kvm --action=add || :
+%if 0%{have_kvm_setup}
+  systemctl daemon-reload # Make sure it sees the new presets and unitfile
+  %systemd_post kvm-setup.service
+  if systemctl is-enabled kvm-setup.service > /dev/null; then
+    systemctl start kvm-setup.service
+  fi
+%endif
+
+%if 0%{have_kvm_setup}
+%preun -n qemu-kvm-core
+%systemd_preun kvm-setup.service
+%endif
+
+%post -n qemu-kvm-common
+%systemd_post ksm.service
+%systemd_post ksmtuned.service
+
+# Create the kvm and qemu groups and the qemu user with fixed IDs,
+# unless they already exist.
+getent group kvm >/dev/null || groupadd -g 36 -r kvm
+getent group qemu >/dev/null || groupadd -g 107 -r qemu
+getent passwd qemu >/dev/null || \
+useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \
+    -c "qemu user" qemu
+
+%preun -n qemu-kvm-common
+%systemd_preun ksm.service
+%systemd_preun ksmtuned.service
+
+%postun -n qemu-kvm-common
+%systemd_postun_with_restart ksm.service
+%systemd_postun_with_restart ksmtuned.service
+
+%post -n qemu-guest-agent
+%systemd_post qemu-guest-agent.service
+%preun -n qemu-guest-agent
+%systemd_preun qemu-guest-agent.service
+%postun -n qemu-guest-agent
+%systemd_postun_with_restart qemu-guest-agent.service
+
+%files
+# Deliberately empty
+
+
+%files -n qemu-kvm-common
+%defattr(-,root,root)
+%dir %{qemudocdir}
+%doc %{qemudocdir}/Changelog
+%doc %{qemudocdir}/README.rst
+%doc %{qemudocdir}/qemu-doc.html
+%doc %{qemudocdir}/COPYING
+%doc %{qemudocdir}/COPYING.LIB
+%doc %{qemudocdir}/LICENSE
+%doc %{qemudocdir}/README.systemtap
+%doc %{qemudocdir}/qmp-spec.txt
+%doc %{qemudocdir}/qemu-doc.txt
+%doc %{qemudocdir}/qemu-ga-ref.html
+%doc %{qemudocdir}/qemu-ga-ref.txt
+%doc %{qemudocdir}/qemu-qmp-ref.html
+%doc %{qemudocdir}/qemu-qmp-ref.txt
+%doc %{qemudocdir}/interop/*
+%{_mandir}/man7/qemu-qmp-ref.7*
+%{_mandir}/man7/qemu-cpu-models.7*
+%{_bindir}/qemu-keymap
+%{_bindir}/qemu-pr-helper
+%{_bindir}/qemu-edid
+%{_bindir}/qemu-trace-stap
+%{_unitdir}/qemu-pr-helper.service
+%{_unitdir}/qemu-pr-helper.socket
+%{_mandir}/man7/qemu-ga-ref.7*
+
+%dir %{_datadir}/%{name}/
+%{_datadir}/%{name}/keymaps/
+%{_mandir}/man1/%{name}.1*
+%{_mandir}/man1/qemu-trace-stap.1*
+%{_mandir}/man7/qemu-block-drivers.7*
+%attr(4755, -, -) %{_libexecdir}/qemu-bridge-helper
+%config(noreplace) %{_sysconfdir}/sasl2/%{name}.conf
+%{_unitdir}/ksm.service
+%{_libexecdir}/ksmctl
+%config(noreplace) %{_sysconfdir}/sysconfig/ksm
+%{_unitdir}/ksmtuned.service
+%{_sbindir}/ksmtuned
+%{_udevdir}/udev-kvm-check
+%{_udevrulesdir}/81-kvm-rhel.rules
+%ghost %{_sysconfdir}/kvm
+%config(noreplace) %{_sysconfdir}/ksmtuned.conf
+%dir %{_sysconfdir}/%{name}
+%config(noreplace) %{_sysconfdir}/%{name}/bridge.conf
+%config(noreplace) %{_sysconfdir}/modprobe.d/vhost.conf
+%config(noreplace) %{_sysconfdir}/modprobe.d/kvm.conf
+%{_datadir}/%{name}/simpletrace.py*
+%{_datadir}/%{name}/tracetool/*.py*
+%{_datadir}/%{name}/tracetool/backend/*.py*
+%{_datadir}/%{name}/tracetool/format/*.py*
+
+%files -n qemu-kvm-core
+%defattr(-,root,root)
+# Firmware images and ROM links are packaged only on the architectures
+# for which the install section creates them.
+%ifarch x86_64
+  %{_datadir}/%{name}/bios.bin
+  %{_datadir}/%{name}/bios-256k.bin
+  %{_datadir}/%{name}/linuxboot.bin
+  %{_datadir}/%{name}/multiboot.bin
+  %{_datadir}/%{name}/kvmvapic.bin
+  %{_datadir}/%{name}/sgabios.bin
+  %{_datadir}/%{name}/pvh.bin
+%endif
+%ifarch s390x
+  %{_datadir}/%{name}/s390-ccw.img
+  %{_datadir}/%{name}/s390-netboot.img
+%endif
+%ifnarch aarch64 s390x
+  %{_datadir}/%{name}/vgabios.bin
+  %{_datadir}/%{name}/vgabios-cirrus.bin
+  %{_datadir}/%{name}/vgabios-qxl.bin
+  %{_datadir}/%{name}/vgabios-stdvga.bin
+  %{_datadir}/%{name}/vgabios-vmware.bin
+  %{_datadir}/%{name}/vgabios-virtio.bin
+  %{_datadir}/%{name}/vgabios-ramfb.bin
+  %{_datadir}/%{name}/vgabios-bochs-display.bin
+  %{_datadir}/%{name}/efi-e1000.rom
+  %{_datadir}/%{name}/efi-e1000e.rom
+  %{_datadir}/%{name}/efi-virtio.rom
+  %{_datadir}/%{name}/efi-pcnet.rom
+  %{_datadir}/%{name}/efi-rtl8139.rom
+  %{_datadir}/%{name}/efi-ne2k_pci.rom
+%endif
+%{_datadir}/icons/*
+%{_datadir}/%{name}/linuxboot_dma.bin
+%{_datadir}/%{name}/dump-guest-memory.py*
+%{_libexecdir}/qemu-kvm
+%{_datadir}/systemtap/tapset/qemu-kvm.stp
+%{_datadir}/systemtap/tapset/qemu-kvm-log.stp
+%{_datadir}/%{name}/trace-events-all
+%{_datadir}/systemtap/tapset/qemu-kvm-simpletrace.stp
+%{_datadir}/%{name}/systemtap/script.d/qemu_kvm.stp
+%{_datadir}/%{name}/systemtap/conf.d/qemu_kvm.conf
+%if 0%{have_kvm_setup}
+  %{_prefix}/lib/systemd/kvm-setup
+  %{_unitdir}/kvm-setup.service
+  %{_presetdir}/85-kvm.preset
+%endif
+%if 0%{have_memlock_limits}
+  %{_sysconfdir}/security/limits.d/95-kvm-memlock.conf
+%endif
+%{_libexecdir}/virtiofsd
+%{_datadir}/%{name}/vhost-user/50-qemu-virtiofsd.json
+
+%files -n qemu-img
+%defattr(-,root,root)
+%{_bindir}/qemu-img
+%{_bindir}/qemu-io
+%{_bindir}/qemu-nbd
+%{_mandir}/man1/qemu-img.1*
+%{_mandir}/man8/qemu-nbd.8*
+
+%files -n qemu-guest-agent
+%defattr(-,root,root,-)
+%doc COPYING README.rst
+%{_bindir}/qemu-ga
+%{_mandir}/man8/qemu-ga.8*
+%{_unitdir}/qemu-guest-agent.service
+%{_udevrulesdir}/99-qemu-guest-agent.rules
+%config(noreplace) %{_sysconfdir}/sysconfig/qemu-ga
+%{_sysconfdir}/qemu-ga
+%{_sysconfdir}/qemu-kvm/fsfreeze-hook
+%{_datadir}/%{name}/qemu-ga
+%dir %{_localstatedir}/log/qemu-ga
+
+%files tests
+%{testsdir}
+
+%files block-curl
+%{_libdir}/qemu-kvm/block-curl.so
+
+%if 0%{have_gluster}
+%files block-gluster
+%{_libdir}/qemu-kvm/block-gluster.so
+%endif
+
+%files block-iscsi
+%{_libdir}/qemu-kvm/block-iscsi.so
+
+%files block-rbd
+%{_libdir}/qemu-kvm/block-rbd.so
+
+%files block-ssh
+%{_libdir}/qemu-kvm/block-ssh.so
+
+
+%changelog
+* Tue Feb 08 2022 Jon Maloy - 4.2.0-59.el8_5.2
+- kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch [bz#2048627]
+- Resolves: bz#2048627
+  (CVE-2022-0358 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privilege escalation via CVE-2018-13405 [rhel-8.5.0.z])
+
+* Thu Nov 25 2021 Jon Maloy - 4.2.0-59.el8_5
+- kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch [bz#2025605]
+- kvm-e1000-fix-tx-re-entrancy-problem.patch [bz#2025011]
+- Resolves: bz#2025605
+  (CVE-2021-3930 virt:rhel/qemu-kvm: QEMU: off-by-one error in mode_sense_page() in hw/scsi/scsi-disk.c [rhel-8.5.0.z])
+- Resolves: bz#2025011
+  (CVE-2021-20257 virt:rhel/qemu-kvm: QEMU: net: e1000: infinite loop while processing transmit descriptors [rhel-8.5.0.z])
+
+* Fri Oct 01 2021 Jon Maloy - 4.2.0-59
+- kvm-scsi-make-io_timeout-configurable.patch [bz#1994041]
+- Resolves: bz#1994041
+  (qemu-kvm scsi: change default passthrough timeout to non-infinite)
+
+* Wed Aug 18 2021 Danilo Cesar Lemes de Paula - 4.2.0-58.el8
+- kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch [bz#1967496]
+- kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch [bz#1967496]
+- Resolves: bz#1967496
+  ([virtio-fs] nfs/xfstest generic/089 generic/478 generic/632 failed)
+
+* Wed Aug 04 2021 Miroslav
Rezanina - 4.2.0-57 +- kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch [bz#1969848] +- kvm-async-use-explicit-memory-barriers.patch [bz#1969848] +- Resolves: bz#1969848 + (qemu-img convert hangs on aarch64) + +* Thu Jul 29 2021 Miroslav Rezanina - 4.2.0-56 +- kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch [bz#1967716] +- kvm-qga-add-ssh-add-remove-authorized-keys.patch [bz#1967716] +- kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch [bz#1967716] +- kvm-qga-add-ssh-get-authorized-keys.patch [bz#1967716] +- kvm-Add-mtod_check.patch [bz#1970819 bz#1970835 bz#1970843 bz#1970853] +- kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch [bz#1970819 bz#1970835 bz#1970843 bz#1970853] +- kvm-bootp-check-bootp_input-buffer-size.patch [bz#1970819] +- kvm-upd6-check-udp6_input-buffer-size.patch [bz#1970835] +- kvm-tftp-check-tftp_input-buffer-size.patch [bz#1970843] +- kvm-tftp-introduce-a-header-structure.patch [bz#1970819 bz#1970835 bz#1970843 bz#1970853] +- kvm-udp-check-upd_input-buffer-size.patch [bz#1970853] +- kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch [bz#1970819 bz#1970835 bz#1970843 bz#1970853] +- kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch [bz#1982134] +- kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch [bz#1982134] +- Resolves: bz#1967716 + (RFE: rebuild guest agent to include public ssh injection api support) +- Resolves: bz#1970819 + (CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8]) +- Resolves: bz#1970835 + (CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8]) +- Resolves: bz#1970843 + (CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8]) +- Resolves: bz#1970853 + (CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to 
information disclosure (udp) [rhel-8]) +- Resolves: bz#1982134 + (QEMU core dump while booting guest with a non-exist fd on tap) + +* Fri Jul 23 2021 Danilo Cesar Lemes de Paula - 4.2.0-55.el8 +- kvm-net-introduce-qemu_receive_packet.patch [bz#1932917] +- kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch [bz#1932917] +- kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch [bz#1932917] +- kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch [bz#1932917] +- kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch [bz#1932917] +- kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch [bz#1932917] +- kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch [bz#1932917] +- kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch [bz#1932917] +- kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch [bz#1932917] +- Resolves: bz#1932917 + (CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow) + +* Thu Jul 22 2021 Danilo Cesar Lemes de Paula - 4.2.0-54.el8 +- kvm-redhat-Fix-unversioned-Obsoletes-warning.patch [bz#1967329] +- Resolves: bz#1967329 + (Make qemu-kvm use versioned obsoletes for qemu-kvm-ma and qemu-kvm-rhev) + +* Fri Jul 02 2021 Danilo Cesar Lemes de Paula - 4.2.0-53.el8 +- kvm-virtiofsd-Whitelist-fchmod.patch [bz#1967914] +- kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch [bz#1957866] +- kvm-Compress-lines-for-immediate-return.patch [bz#1970912] +- kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch [bz#1970912] +- Resolves: bz#1967914 + ([virtio-fs] virtiofsd quit when coping file to a folder in virtio-fs mounted volume(windows guest)) +- Resolves: bz#1957866 + (RHEL8.4 - EEH capability disabled on KVM guest and recovery of PCI passthru device fails(CX5 / mlx5_core) (qemu-kvm)) +- Resolves: bz#1970912 + (Deployment fails with "Invalid or missing agent token received") + +* Fri Jun 11 2021 Danilo Cesar Lemes de Paula - 
4.2.0-52.el8 +- kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch [bz#1877163] +- kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch [bz#1944861] +- kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch [bz#1969768] +- kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch [bz#1969768] +- Resolves: bz#1877163 + ([FJ 8.3 Bug] The progress bar of the "virt-clone --nonsparse" command shows the progress rate exceeding 100%.) +- Resolves: bz#1944861 + (Qemu-img convert fails when source image is on gpfs) +- Resolves: bz#1969768 + ([ppc64le] Hotplug vcpu device hit call trace:[qemu output] KVM: unknown exit, hardware reason 7fff9ce87ed8) + +* Tue May 25 2021 Danilo Cesar Lemes de Paula - 4.2.0-51.el8 +- kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch [bz#1940450] +- kvm-vfio-ccw-Connect-the-device-request-notifier.patch [bz#1940450] +- kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch [bz#1942880] +- kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch [bz#1942880] +- kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch [bz#1942880] +- Resolves: bz#1940450 + (RHEL8.5 - Mediated Device already in use by same domain we are booting (vfio-ccw/Multipath Testing) (kvm) - qemu-kvm part (also has kernel and libvirt parts)) +- Resolves: bz#1942880 + (RHEL8.4 Nightly[0322] - KVM guest fails to find zipl boot menu index (qemu-kvm)) + +* Wed May 05 2021 Danilo Cesar Lemes de Paula - 4.2.0-50.el8 +- kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch [bz#1925430] +- kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch [bz#1842478] +- kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch [bz#1842478] +- kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch [bz#1842478] +- kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch [bz#1842478] +- kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch [bz#1842478] +- kvm-softmmu-memory-Log-invalid-memory-accesses.patch 
[bz#1842478] +- Resolves: bz#1925430 + (CVE-2021-20221 virt:rhel/qemu-kvm: qemu: out-of-bound heap buffer access via an interrupt ID field [rhel-8.5.0]) +- Resolves: bz#1842478 + (CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0]) + +* Wed Apr 28 2021 Danilo Cesar Lemes de Paula - 4.2.0-49.el8 +- kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch [bz#1892350] +- kvm-e1000-fail-early-for-evil-descriptor.patch [bz#1930092] +- kvm-net-forbid-the-reentrant-RX.patch [bz#1859175] +- kvm-qemu-img-convert-Don-t-pre-zero-images.patch [bz#1855250] +- kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch [bz#1932823] +- Resolves: bz#1892350 + (CVE-2020-27617 virt:rhel/qemu-kvm: QEMU: net: an assert failure via eth_get_gso_type [rhel-8.5.0]) +- Resolves: bz#1930092 + (CVE-2021-20257 virt:rhel/qemu-kvm: QEMU: net: e1000: infinite loop while processing transmit descriptors [rhel-8.5.0]) +- Resolves: bz#1859175 + (CVE-2020-15859 virt:rhel/qemu-kvm: QEMU: net: e1000e: use-after-free while sending packets [rhel-8]) +- Resolves: bz#1855250 + (qemu-img convert uses possibly slow pre-zeroing on block storage) +- Resolves: bz#1932823 + (after upgrade from 4.3 to 4.4 audio stops working in guests after couple of seconds) + +* Tue Mar 16 2021 Danilo Cesar Lemes de Paula - 4.2.0-48.el8 +- kvm-ide-atapi-check-logical-block-address-and-read-size-.patch [bz#1917451] +- Resolves: bz#1917451 + (CVE-2020-29443 virt:rhel/qemu-kvm: QEMU: ide: atapi: OOB access while processing read commands [rhel-8.4.0]) + +* Mon Mar 08 2021 Danilo Cesar Lemes de Paula - 4.2.0-47.el8 +- kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch [bz#1790620] +- kvm-target-i386-add-fast-short-REP-MOV-support.patch [bz#1790620] +- kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch [bz#1790620] +- kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch [bz#1790620] +- Resolves: bz#1790620 + ([RFE] AMD Milan - Add KVM/support for EPYC-Milan 
CPU Model - Slow Train) + +* Wed Mar 03 2021 Danilo Cesar Lemes de Paula - 4.2.0-46.el8 +- kvm-redhat-makes-qemu-respect-system-s-crypto-profile.patch [bz#1902960] +- kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch [bz#1912891] +- Resolves: bz#1902960 + (QEMU doesn't honour system crypto policies) +- Resolves: bz#1912891 + ([ppc64le] --disk cdimage.iso,bus=usb fails to boot) + +* Wed Feb 10 2021 Jon Maloy - 4.2.0-45.el8 +- kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch [bz#1919111] +- kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch [bz#1919111] +- kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch [bz#1919111] +- Resolves: bz#1919111 + (CVE-2020-35517 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privileged host device access from guest [rhel-8.4.0]) + +* Tue Feb 02 2021 Jon Maloy - 4.2.0-44.el8 +- kvm-spapr-Improve-handling-of-fdt-buffer-size.patch [bz#1901837] +- kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch [bz#1901837] +- kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch [bz#1901837] +- kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch [bz#1901837] +- kvm-spapr-Allow-memory-unplug-to-always-succeed.patch [bz#1901837] +- kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch [bz#1901837] +- kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch [bz#1834281] +- kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch [bz#1834281] +- kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch [bz#1912974] +- Resolves: bz#1834281 + (qemu-img convert abort when converting image with unaligned size) +- Resolves: bz#1901837 + (Failed to hotunplug pc-dimm device) +- Resolves: bz#1912974 + (CVE-2020-11947 virt:rhel/qemu-kvm: QEMU: heap buffer overflow in iscsi_aio_ioctl_cb() in block/iscsi.c may lead to information disclosure [rhel-8]) + +* Wed Jan 27 2021 Danilo Cesar Lemes de Paula - 4.2.0-43.el8 +- kvm-Drop-bogus-IPv6-messages.patch [bz#1918054] +- Resolves: bz#1918054 + 
(CVE-2020-10756 virt:rhel/qemu-kvm: QEMU: slirp: networking out-of-bounds read information disclosure vulnerability [rhel-8.4.0]) + +* Thu Jan 21 2021 Danilo Cesar Lemes de Paula - 4.2.0-42.el8 +- kvm-linux-headers-add-vfio-DMA-available-capability.patch [bz#1905391] +- kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch [bz#1905391] +- kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch [bz#1905391] +- kvm-vfio-Find-DMA-available-capability.patch [bz#1905391] +- kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch [bz#1905391] +- kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch [bz#1905391] +- kvm-s390x-fix-build-for-without-default-devices.patch [bz#1905391] +- Resolves: bz#1905391 + (RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm)) + +* Mon Jan 18 2021 Danilo Cesar Lemes de Paula - 4.2.0-41.el8 +- kvm-udev-kvm-check-remove-the-exceeded-subscription-limi.patch [bz#1909244] +- kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch [bz#1843852] +- kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch [bz#1843852] +- kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch [bz#1843852] +- kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch [bz#1843852] +- kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch [bz#1843852] +- kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch [bz#1843852] +- kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch [bz#1843852] +- kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch [bz#1843852] +- kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch [bz#1843852] +- kvm-memory-Add-IOMMUTLBEvent.patch [bz#1843852] +- kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch [bz#1843852] +- kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch [bz#1843852] +- kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch [bz#1843852] +- kvm-memory-clamp-cached-translation-in-case-it-points-to.patch [bz#1904393] +- 
kvm-hw-ehci-check-return-value-of-usb_packet_map.patch [bz#1898628] +- kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch [bz#1903070] +- Resolves: bz#1909244 + (Remove KVM guest count and limit info message) +- Resolves: bz#1843852 + (qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed.) +- Resolves: bz#1904393 + (CVE-2020-27821 virt:rhel/qemu-kvm: QEMU: heap buffer overflow in msix_table_mmio_write() in hw/pci/msix.c [rhel-8]) +- Resolves: bz#1898628 + (CVE-2020-25723 virt:rhel/qemu-kvm: QEMU: assertion failure through usb_packet_unmap() in hw/usb/hcd-ehci.c [rhel-8]) +- Resolves: bz#1903070 + (CVE-2020-25707 CVE-2020-28916 virt:rhel/qemu-kvm: various flaws [rhel-8]) + +* Mon Jan 04 2021 Danilo Cesar Lemes de Paula - 4.2.0-40.el8 +- kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch [bz#1910267] +- kvm-qga-rename-Error-parameter-to-more-common-errp.patch [bz#1910326] +- kvm-util-Introduce-qemu_get_host_name.patch [bz#1910326] +- kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch [bz#1910326] +- kvm-redhat-add-un-pre-install-systemd-hooks-for-qemu-ga.patch [bz#1910220] +- Resolves: bz#1910267 + (There is no soft link '/etc/qemu-kvm/fsfreeze-hook') +- Resolves: bz#1910326 + (Incorrect hostname returned by qga command 'guest-get-host-name') +- Resolves: bz#1910220 + (qemu-ga service still active and can work after qemu-guest-agent been removed) + +* Wed Dec 23 2020 Danilo Cesar Lemes de Paula - 4.2.0-39.el8 +- kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch [bz#1901837] +- kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch [bz#1901837] +- kvm-slirp-check-pkt_len-before-reading-protocol-header.patch [bz#1902237] +- kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch [bz#1905386] +- kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch [bz#1859494] +- 
kvm-error-Fix-examples-in-error.h-s-big-comment.patch [bz#1859494] +- kvm-error-Improve-error.h-s-big-comment.patch [bz#1859494] +- kvm-error-Document-Error-API-usage-rules.patch [bz#1859494] +- kvm-error-New-macro-ERRP_GUARD.patch [bz#1859494] +- kvm-qga-add-command-guest-get-disks.patch [bz#1859494] +- kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch [bz#1859494] +- kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch [bz#1859494] +- kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch [bz#1859494] +- kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch [bz#1859494] +- Resolves: bz#1859494 + (Report logical_name for disks without mounted file-system) +- Resolves: bz#1901837 + (Failed to hotunplug pc-dimm device) +- Resolves: bz#1902237 + (CVE-2020-29129 CVE-2020-29130 virt:rhel/qemu-kvm: QEMU: slirp: out-of-bounds access while processing ARP/NCSI packets [rhel-8]) +- Resolves: bz#1905386 + (RHEL8.3 - s390x/s390-virtio-ccw: Reset PCI devices during subsystem reset (qemu-kvm)) + +* Fri Dec 11 2020 Danilo Cesar Lemes de Paula - 4.2.0-38.el8 +- kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch [bz#1880546] +- kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch [bz#1903135] +- kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch [bz#1903135] +- kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch [bz#1903135] +- kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch [bz#1903135] +- Resolves: bz#1880546 + (qemu use SCMP_ACT_TRAP even SCMP_ACT_KILL_PROCESS is available) +- Resolves: bz#1903135 + (RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm)) + +* Mon Nov 23 2020 Danilo Cesar Lemes de Paula - 4.2.0-37.el8 +- kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch [bz#1860994] +- Resolves: bz#1860994 + (CVE-2020-16092 virt:rhel/qemu-kvm: QEMU: reachable assertion failure in net_tx_pkt_add_raw_fragment() in hw/net/net_tx_pkt.c [rhel-8]) + +* 
Fri Nov 20 2020 Danilo Cesar Lemes de Paula - 4.2.0-36.el8 +- kvm-qga-fix-assert-regression-on-guest-shutdown.patch [bz#1884531] +- kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch [bz#1857733] +- kvm-virtio-add-vhost-user-fs-ccw-device.patch [bz#1857733] +- kvm-Ensure-vhost-user-fs-is-enabled-on-s390x.patch [bz#1857733] +- kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch [bz#1798506] +- kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch [bz#1798506] +- kvm-s390-sclp-rework-sclp-boundary-checks.patch [bz#1798506] +- kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch [bz#1798506] +- kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch [bz#1798506] +- kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch [bz#1798506] +- kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch [bz#1798506] +- kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch [bz#1798506] +- kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch [bz#1798506] +- kvm-s390-guest-support-for-diagnose-0x318.patch [bz#1798506] +- kvm-s390x-pv-Remove-sclp-boundary-checks.patch [bz#1798506] +- kvm-s390x-pv-Fix-diag318-PV-fencing.patch [bz#1798506] +- kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch [bz#1659412] +- kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch [bz#1898700] +- Resolves: bz#1659412 + ([IBM 8.4 FEAT] KVM enablement for enhanced hardware diagnose data of guest kernel on s390x - qemu part) +- Resolves: bz#1798506 + ([IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part) +- Resolves: bz#1857733 + ([IBM 8.4 FEAT] KVM: Add support for virtio-fs on s390x - qemu part) +- Resolves: bz#1884531 + (qemu-ga aborts after guest-shutdown command) +- Resolves: bz#1898700 + (qemu-kvm for RHEL-8.4 doesn't build due to a possible incompatibility with systemtap-sdt-devel-4.4-1) + +* Mon Oct 19 2020 Danilo Cesar Lemes de Paula - 4.2.0-35.el8 +- 
kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch [bz#1755075] +- kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch [bz#1755075] +- kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch [bz#1755075] +- kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch [bz#1874780] +- kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch [bz#1846975] +- kvm-aio-posix-completely-stop-polling-when-disabled.patch [bz#1846975] +- kvm-Remove-explicit-glusterfs-api-dependency.patch [bz#1872854] +- Resolves: bz#1755075 + ([qemu-guest-agent] fsinfo doesn't return disk info on s390x) +- Resolves: bz#1846975 + (Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous) +- Resolves: bz#1872854 + (move the glusterfs dependency out of qemu-kvm-core to the glusterfs module) +- Resolves: bz#1874780 + (-prom-env does not validate input) + +* Tue Sep 08 2020 Danilo Cesar Lemes de Paula - 4.2.0-34.el8 +- kvm-usb-fix-setup_len-init-CVE-2020-14364.patch [bz#1869710] +- Resolves: bz#1869710 + (CVE-2020-14364 qemu-kvm: QEMU: usb: out-of-bounds r/w access issue while processing usb packets [rhel-8.3.0]) + +* Wed Aug 19 2020 Danilo Cesar Lemes de Paula - 4.2.0-33.el8 +- kvm-Require-libfdt-1.6.0.patch [bz#1867847] +- Resolves: bz#1867847 + ([ppc] virt module 7629: /usr/libexec/qemu-kvm: undefined symbol: fdt_check_full, version LIBFDT_1.2) + +* Mon Aug 10 2020 Danilo 
Cesar Lemes de Paula - 4.2.0-32.el8 +- kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch [bz#1780385] +- kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch [bz#1689341] +- kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch [bz#1689341] +- kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch [bz#1863034] +- Resolves: bz#1689341 + (QEMU should report an error and return failure if AMD SEV is not enabled in the kernel) +- Resolves: bz#1780385 + ([RFE] AMD EPYC-Rome support for KVM / QEMU guest) +- Resolves: bz#1863034 + (RHEL8.3 Beta - Secure Execution: Unable to start Qemu with "-no-reboot" option (qemu-kvm)) + +* Wed Jul 22 2020 Danilo Cesar Lemes de Paula - 4.2.0-31.el8 +- kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch [bz#1807057] +- kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch [bz#1807057] +- kvm-iotests-026-Test-EIO-on-allocation-in-a-data-file.patch [bz#1807057] +- kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch [bz#1807057] +- Resolves: bz#1807057 + (qcow2_alloc_cluster_abort() frees preallocated zero clusters) + +* Tue Jul 07 2020 Danilo Cesar Lemes de Paula - 4.2.0-30.el8 +- kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch [bz#1835390] +- kvm-s390x-sigp-Fix-sense-running-reporting.patch [bz#1854092] +- kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch [bz#1854092] +- kvm-virtio-net-fix-removal-of-failover-device.patch [] +- Resolves: bz#1835390 + (qemu promote host does not support 'EDX.npt' and 'EDX.nrip-save' when test with Q35 machine type on EPYC host) +- Resolves: bz#1854092 + (kvm-unit-tests: tcg smp FAIL) + +* Sun Jun 28 2020 Danilo Cesar Lemes de Paula - 4.2.0-29.el8 +- kvm-vfio-ccw-Fix-error-message.patch [bz#1660916] +- kvm-vfio-ccw-allow-non-prefetch-ORBs.patch [bz#1660916] +- kvm-linux-headers-support-vfio-ccw-features.patch [bz#1660916] +- kvm-vfio-ccw-Refactor-cleanup-of-regions.patch [bz#1660916] +- 
kvm-vfio-ccw-Add-support-for-the-schib-region.patch [bz#1660916] +- kvm-vfio-ccw-Refactor-ccw-irq-handler.patch [bz#1660916] +- kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch [bz#1660916] +- kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch [bz#1660916] +- kvm-config-enable-VFIO_CCW.patch [bz#1660916] +- kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch [] +- kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch [] +- kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch [bz#1838070] +- Resolves: bz#1660916 + ([IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part) +- Resolves: bz#1838070 + (CVE-2020-1983 virt:rhel/qemu-kvm: QEMU: slirp: use-after-free in ip_reass() function in ip_input.c [rhel-8]) + +* Fri Jun 19 2020 Danilo Cesar Lemes de Paula - 4.2.0-28.el8 +- kvm-redhat-Install-the-s390-netboot.img-that-we-ve-built.patch [bz#1828317] +- kvm-linux-headers-update-kvm.h.patch [bz#1828317] +- kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch [bz#1828317] +- kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch [bz#1828317] +- kvm-s390x-Move-initial-reset.patch [bz#1828317] +- kvm-s390x-Move-clear-reset.patch [bz#1828317] +- kvm-s390x-Beautify-diag308-handling.patch [bz#1828317] +- kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch [bz#1828317] +- kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch [bz#1828317] +- kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch [bz#1828317] +- kvm-pc-bios-s390x-Fix-reset-psw-mask.patch [bz#1828317] +- kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch [bz#1828317] +- kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch [bz#1828317] +- kvm-s390x-Add-missing-vcpu-reset-functions.patch [bz#1828317] +- kvm-s390-sclp-improve-special-wait-psw-logic.patch [bz#1828317] +- kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch [bz#1828317] +- kvm-s390-ipl-sync-back-loadparm.patch [bz#1828317] +- 
kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch [bz#1828317] +- kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch [bz#1828317] +- kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch [bz#1828317] +- kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch [bz#1828317] +- kvm-s390x-protvirt-Support-unpack-facility.patch [bz#1828317] +- kvm-s390x-protvirt-Add-migration-blocker.patch [bz#1828317] +- kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch [bz#1828317] +- kvm-s390x-protvirt-KVM-intercept-changes.patch [bz#1828317] +- kvm-s390x-Add-SIDA-memory-ops.patch [bz#1828317] +- kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch [bz#1828317] +- kvm-s390x-protvirt-SCLP-interpretation.patch [bz#1828317] +- kvm-s390x-protvirt-Set-guest-IPL-PSW.patch [bz#1828317] +- kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch [bz#1828317] +- kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch [bz#1828317] +- kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch [bz#1828317] +- kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch [bz#1828317] +- kvm-s390x-Add-unpack-facility-feature-to-GA1.patch [bz#1828317] +- kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch [bz#1828317] +- kvm-s390x-pv-Retry-ioctls-on-EINTR.patch [bz#1828317] +- kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch [bz#1828317] +- kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch [bz#1828317] +- kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch [bz#1756946] +- kvm-introduce-kvm_kernel_irqchip_-functions.patch [bz#1756946] +- kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch [bz#1756946] +- kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch [bz#1823275] +- Resolves: bz#1756946 + ([zKVM] Re-enable KVM_CAP_S390_AIS for new machine types) +- Resolves: bz#1823275 + (RHEL8.1 - GPU Numa nodes not visible in guest post the pass-through.) 
+- Resolves: bz#1828317 + ([IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part) + +* Fri Jun 19 2020 Danilo C. L. de Paula - 4.2.0 +- Resolves: bz#1810193 +(Upgrade components in virt:rhel module:stream for RHEL-8.3 release) + +* Tue Jun 09 2020 Danilo C. L. de Paula - 4.2.0-25 +- Resolves: bz#1810193 + (Upgrade components in virt:rhel module:stream for RHEL-8.3 release) + Another sync + +* Thu Jun 04 2020 Danilo C. L. de Paula - 4.2.0-23.el8 +- Resolves: bz#1810193 + (Upgrade components in virt:rhel module:stream for RHEL-8.3 release) + Another synchronization + +* Mon Apr 27 2020 Danilo C. L. de Paula - 4.2.0 +- Resolves: bz#1810193 + (Upgrade components in virt:rhel module:stream for RHEL-8.3 release) + +* Fri Feb 21 2020 Danilo Cesar Lemes de Paula - 2.12.0-99.el8 +- kvm-slirp-disable-tcp_emu.patch [bz#1791677] +- kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch [bz#1790308] +- Resolves: bz#1790308 + (qemu-kvm core dump when do L1 guest live migration with L2 guest running) +- Resolves: bz#1791677 + (QEMU: Slirp: disable emulation of tcp programs like ftp IRC etc. 
[rhel-8]) + +* Mon Feb 10 2020 Danilo Cesar Lemes de Paula - 2.12.0-98.el8 +- kvm-iscsi-Avoid-potential-for-get_status-overflow.patch [bz#1794501] +- kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch [bz#1794501] +- kvm-clean-up-callback-when-del-virtqueue.patch [bz#1708480] +- kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch [bz#1708480] +- kvm-virtio-reset-region-cache-when-on-queue-deletion.patch [bz#1708480] +- kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch [bz#1708480] +- Resolves: bz#1708480 + ([Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device) +- Resolves: bz#1794501 + (CVE-2020-1711 qemu-kvm: QEMU: block: iscsi: OOB heap access via an unexpected response of iSCSI Server [rhel-8.2.0]) + +* Fri Jan 24 2020 Miroslav Rezanina - 2.12.0-97.el8 +- kvm-exec-Fix-MAP_RAM-for-cached-access.patch [bz#1769613] +- kvm-virtio-Return-true-from-virtio_queue_empty-if-broken.patch [bz#1769613] +- kvm-usbredir-Prevent-recursion-in-usbredir_write.patch [bz#1752320] +- kvm-xhci-recheck-slot-status.patch [bz#1752320] +- kvm-tcp_emu-Fix-oob-access.patch [bz#1791566] +- kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch [bz#1791566] +- kvm-slirp-use-correct-size-while-emulating-commands.patch [bz#1791566] +- Resolves: bz#1752320 + (vm gets stuck when migrate vm back and forth with remote-viewer trying to connect) +- Resolves: bz#1769613 + ([SEV] kexec mays hang at "[sda] Synchronizing SCSI cache " before switching to new kernel) +- Resolves: bz#1791566 + (CVE-2020-7039 virt:rhel/qemu-kvm: QEMU: slirp: OOB buffer access while emulating tcp protocols in tcp_emu() [rhel-8.2.0]) + +* Tue Jan 07 2020 Danilo Cesar Lemes de Paula - 2.12.0-96.el8 +- kvm-i386-Remove-cpu64-rhel6-CPU-model.patch [bz#1741346] +- Resolves: bz#1741346 + (Remove the "cpu64-rhel6" CPU from qemu-kvm) + +* Thu Jan 02 2020 Danilo Cesar Lemes de Paula - 2.12.0-95.el8 +- kvm-virtio-gpu-block-both-2d-and-3d-rendering.patch [bz#1674324] +- 
kvm-x86-Intel-AVX512_BF16-feature-enabling.patch [bz#1642541] +- Resolves: bz#1642541 + ([Intel 8.2 Feature] qemu-kvm Enable BFloat16 data type support) +- Resolves: bz#1674324 + (With , qemu either refuses to start completely or spice-server crashes afterwards) + +* Wed Dec 18 2019 Danilo Cesar Lemes de Paula - 2.12.0-94.el8 +- kvm-util-mmap-alloc-Add-a-is_pmem-parameter-to-qemu_ram_.patch [bz#1539282] +- kvm-mmap-alloc-unfold-qemu_ram_mmap.patch [bz#1539282] +- kvm-mmap-alloc-fix-hugetlbfs-misaligned-length-in-ppc64.patch [bz#1539282] +- kvm-util-mmap-alloc-support-MAP_SYNC-in-qemu_ram_mmap.patch [bz#1539282] +- kvm-x86-cpu-Enable-MOVDIRI-cpu-feature.patch [bz#1634827] +- kvm-x86-cpu-Enable-MOVDIR64B-cpu-feature.patch [bz#1634827] +- kvm-add-call-to-qemu_add_opts-for-overcommit-option.patch [bz#1634827] +- kvm-support-overcommit-cpu-pm-on-off.patch [bz#1634827] +- kvm-i386-cpu-make-cpu-host-support-monitor-mwait.patch [] +- kvm-x86-cpu-Add-support-for-UMONITOR-UMWAIT-TPAUSE.patch [bz#1634827] +- kvm-target-i386-Add-support-for-save-load-IA32_UMWAIT_CO.patch [bz#1634827] +- Resolves: bz#1539282 + ([Intel 8.2 Feature][Crystal Ridge] Support MAP_SYNC - qemu-kvm) +- Resolves: bz#1634827 + ([Intel 8.2 Feat] KVM Enable SnowRidge Accelerator Interface Architecture (AIA) - qemu) + +* Wed Dec 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-93.el8 +- kvm-target-i386-Export-TAA_NO-bit-to-guests.patch [bz#1771971] +- kvm-target-i386-add-support-for-MSR_IA32_TSX_CTRL.patch [bz#1771971] +- Resolves: bz#1771971 + (CVE-2019-11135 virt:rhel/qemu-kvm: hw: TSX Transaction Asynchronous Abort (TAA) [rhel-8.2.0]) + +* Mon Dec 02 2019 Danilo Cesar Lemes de Paula - 2.12.0-92.el8 +- kvm-x86-cpu-use-FeatureWordArray-to-define-filtered_feat.patch [bz#1689270] +- kvm-i386-Add-x-force-features-option-for-testing.patch [bz#1689270] +- kvm-target-i386-define-a-new-MSR-based-feature-word-FEAT.patch [bz#1689270] +- kvm-i386-display-known-CPUID-features-linewrapped-in-alp.patch [bz#1689270] +- 
kvm-target-i386-kvm-kvm_get_supported_msrs-cleanup.patch [bz#1689270] +- kvm-target-i386-handle-filtered_features-in-a-new-functi.patch [bz#1689270] +- kvm-target-i386-introduce-generic-feature-dependency-mec.patch [bz#1689270] +- kvm-target-i386-expand-feature-words-to-64-bits.patch [bz#1689270] +- kvm-target-i386-add-VMX-definitions.patch [bz#1689270] +- kvm-vmxcap-correct-the-name-of-the-variables.patch [bz#1689270] +- kvm-target-i386-add-VMX-features.patch [bz#1689270] +- kvm-target-i386-work-around-KVM_GET_MSRS-bug-for-seconda.patch [bz#1689270] +- kvm-target-i386-adjust-for-missing-VMX-features.patch [bz#1689270] +- kvm-target-i386-add-VMX-features-to-named-CPU-models.patch [bz#1689270] +- kvm-target-i386-add-VMX-features-to-named-CPU-models-RHE.patch [bz#1689270] +- kvm-vhost-fix-vhost_log-size-overflow-during-migration.patch [bz#1776808] +- Resolves: bz#1689270 + (Nested KVM: limit VMX features according to CPU models - Slow Train) +- Resolves: bz#1776808 + (qemu-kvm crashes when Windows VM is migrated with multiqueue) + +* Wed Nov 27 2019 Danilo Cesar Lemes de Paula - 2.12.0-91.el8 +- kvm-qapi-fill-in-CpuInfoFast.arch-in-query-cpus-fast.patch [bz#1730969] +- kvm-curl-Keep-pointer-to-the-CURLState-in-CURLSocket.patch [bz#1744602] +- kvm-curl-Keep-socket-until-the-end-of-curl_sock_cb.patch [bz#1744602] +- kvm-curl-Check-completion-in-curl_multi_do.patch [bz#1744602] +- kvm-curl-Pass-CURLSocket-to-curl_multi_do.patch [bz#1744602] +- kvm-curl-Report-only-ready-sockets.patch [bz#1744602] +- kvm-curl-Handle-success-in-multi_check_completion.patch [bz#1744602] +- kvm-curl-Check-curl_multi_add_handle-s-return-code.patch [bz#1744602] +- Resolves: bz#1730969 + ([ppc] qmp: The 'arch' value returned by the command 'query-cpus-fast' does not match) +- Resolves: bz#1744602 + (qemu-img gets stuck when stream-converting from http) + +* Tue Nov 12 2019 Danilo Cesar Lemes de Paula - 2.12.0-90.el8 +- kvm-i386-Don-t-print-warning-if-phys-bits-was-set-automa.patch [bz#1719127] 
+- kvm-Disable-CONFIG_I2C-and-CONFIG_IOH3420.patch [bz#1693140] +- kvm-usb-drop-unnecessary-usb_device_post_load-checks.patch [bz#1757482] +- kvm-pc-bios-s390-ccw-define-loadparm-length.patch [bz#1664376] +- kvm-pc-bios-s390-ccw-net-Use-diag308-to-reset-machine-be.patch [bz#1664376] +- kvm-s390-bios-decouple-cio-setup-from-virtio.patch [bz#1664376] +- kvm-s390-bios-decouple-common-boot-logic-from-virtio.patch [bz#1664376] +- kvm-s390-bios-Clean-up-cio.h.patch [bz#1664376] +- kvm-s390-bios-Decouple-channel-i-o-logic-from-virtio.patch [bz#1664376] +- kvm-s390-bios-Map-low-core-memory.patch [bz#1664376] +- kvm-s390-bios-ptr2u32-and-u32toptr.patch [bz#1664376] +- kvm-s390-bios-Support-for-running-format-0-1-channel-pro.patch [bz#1664376] +- kvm-s390-bios-cio-error-handling.patch [bz#1664376] +- kvm-s390-bios-Extend-find_dev-for-non-virtio-devices.patch [bz#1664376] +- kvm-s390-bios-Factor-finding-boot-device-out-of-virtio-c.patch [bz#1664376] +- kvm-s390-bios-Refactor-virtio-to-run-channel-programs-vi.patch [bz#1664376] +- kvm-s390-bios-Use-control-unit-type-to-determine-boot-me.patch [bz#1664376] +- kvm-s390-bios-Add-channel-command-codes-structs-needed-f.patch [bz#1664376] +- kvm-s390-bios-Support-booting-from-real-dasd-device.patch [bz#1664376] +- kvm-s390-bios-Use-control-unit-type-to-find-bootable-dev.patch [bz#1664376] +- kvm-s390x-vfio-ap-Implement-hot-plug-unplug-of-vfio-ap-d.patch [bz#1660906] +- Resolves: bz#1660906 + ([IBM 8.2 FEAT] KVM s390x: Crypto Passthrough Hotplug - qemu part) +- Resolves: bz#1664376 + ([IBM 8.2 FEAT] CCW IPL Support (kvm) - qemu part) +- Resolves: bz#1693140 + (aarch64: qemu: remove smbus_eeprom and i2c from config) +- Resolves: bz#1719127 + ([Intel 8.2 Bug] warning shown when boot VM with “–cpu host” or “–cpu other mode” on ICX platform (physical)) +- Resolves: bz#1757482 + (Fail to migrate a rhel6.10-mt7.6 guest with dimm device) + +* Mon Oct 14 2019 Danilo Cesar Lemes de Paula - 2.12.0-89.el8 +- 
kvm-accel-use-g_strsplit-for-parsing-accelerator-names.patch [bz#1749022] +- kvm-opts-don-t-silently-truncate-long-parameter-keys.patch [bz#1749022] +- kvm-opts-don-t-silently-truncate-long-option-values.patch [bz#1749022] +- kvm-i386-fix-regression-parsing-multiboot-initrd-modules.patch [bz#1749022] +- kvm-i386-only-parse-the-initrd_filename-once-for-multibo.patch [bz#1749022] +- kvm-opts-remove-redundant-check-for-NULL-parameter.patch [bz#1749022] +- kvm-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch [bz#1749724] +- kvm-virtio-blk-Cancel-the-pending-BH-when-the-dataplane-.patch [bz#1708459] +- kvm-s390x-cpumodel-Rework-CPU-feature-definition.patch [bz#1660909] +- kvm-s390x-cpumodel-Set-up-CPU-model-for-AQIC-interceptio.patch [bz#1660909] +- kvm-ccid-Fix-dwProtocols-advertisement-of-T-0.patch [bz#1746361] +- kvm-s390-PCI-fix-IOMMU-region-init.patch [bz#1754643] +- kvm-fw_cfg-Improve-error-message-when-can-t-load-splash-.patch [bz#1607367] +- kvm-fw_cfg-Fix-boot-bootsplash-error-checking.patch [bz#1607367] +- kvm-fw_cfg-Fix-boot-reboot-timeout-error-checking.patch [bz#1607367] +- kvm-hw-nvram-fw_cfg-Store-reboot-timeout-as-little-endia.patch [bz#1607367] +- kvm-intel_iommu-Correct-caching-mode-error-message.patch [bz#1738440] +- kvm-intel_iommu-Sanity-check-vfio-pci-config-on-machine-.patch [bz#1738440] +- kvm-qdev-machine-Introduce-hotplug_allowed-hook.patch [bz#1738440] +- kvm-pc-q35-Disallow-vfio-pci-hotplug-without-VT-d-cachin.patch [bz#1738440] +- kvm-intel_iommu-Remove-the-caching-mode-check-during-fla.patch [bz#1738440] +- kvm-pseries-do-not-allow-memory-less-cpu-less-NUMA-node.patch [bz#1651474] +- Resolves: bz#1607367 + (After boot failed, guest should not reboot when set reboot-timeout < -1) +- Resolves: bz#1651474 + (RHEL8.0 Beta - [4.18.0-32.el8.ppc64le] Guest VM crashes during vcpu hotplug with specific numa configuration (kvm)) +- Resolves: bz#1660909 + ([IBM 8.2 FEAT] KVM s390x: Crypto Passthrough Interrupt Support - qemu part) +- 
Resolves: bz#1708459 + (qemu-kvm core dumped when repeat "system_reset" multiple times during guest boot) +- Resolves: bz#1738440 + (For intel-iommu, qemu shows conflict behaviors between booting a guest with vfio and hot plugging vfio device) +- Resolves: bz#1746361 + (ccid: Fix incorrect dwProtocol advertisement of T=0) +- Resolves: bz#1749022 + (Please backport 950c4e6c94b1 ("opts: don't silently truncate long option values", 2018-05-09)) +- Resolves: bz#1749724 + (CVE-2019-15890 qemu-kvm: QEMU: Slirp: use-after-free during packet reassembly [rhel-8]) +- Resolves: bz#1754643 + (RHEL8.1 Snapshot3 - Passthrough PCI card goes into error state if used in domain (kvm)) + +* Fri Sep 13 2019 Danilo Cesar Lemes de Paula - 2.12.0-88.el8 +- Revert fix for bz#1749724 - this got delayed to 8.2 + (CVE-2019-15890 qemu-kvm: QEMU: Slirp: use-after-free during packet reassembly [rhel-8]) + +* Tue Sep 03 2019 Danilo Cesar Lemes de Paula - 2.12.0-86.el8 +- kvm-Do-not-run-iotests-on-brew-build.patch [bz#1742819] +- kvm-target-ppc-spapr-Add-workaround-option-to-SPAPR_CAP_.patch [bz#1744415] +- kvm-target-ppc-spapr-Add-SPAPR_CAP_CCF_ASSIST.patch [bz#1744415] +- kvm-i386-x86_cpu_list_feature_names-function.patch [bz#1747185] +- kvm-i386-unavailable-features-QOM-property.patch [bz#1747185] +- kvm-file-posix-Handle-undetectable-alignment.patch [bz#1738839] +- kvm-iotests-Tweak-221-sizing-for-different-hole-granular.patch [bz#1738839] +- kvm-iotests-Filter-175-s-allocation-information.patch [bz#1738839] +- kvm-block-posix-Always-allocate-the-first-block.patch [bz#1738839] +- kvm-iotests-Test-allocate_first_block-with-O_DIRECT.patch [bz#1738839] +- Resolves: bz#1738839 + (I/O error when virtio-blk disk is backed by a raw image on 4k disk) +- Resolves: bz#1742819 + (Remove iotests from qemu-kvm builds [RHEL 8.1.0]) +- Resolves: bz#1744415 + (Backport support for count cache flush Spectre v2 mitigation [slow train]) +- Resolves: bz#1747185 + ("filtered-features" QOM property is not 
available) + +* Mon Aug 19 2019 Danilo Cesar Lemes de Paula - 2.12.0-85.el8 +- kvm-console-Avoid-segfault-in-screendump.patch [bz#1684383] +- kvm-usb-hub-clear-suspend-on-detach.patch [bz#1619661] +- kvm-qemu-img-fix-regression-copying-secrets-during-conve.patch [bz#1727821] +- Resolves: bz#1619661 + (the attach hub on one hub still exits in device manager after unhotplug) +- Resolves: bz#1684383 + (qemu crashed when take screenshot for 2nd head of virtio video device if the display not opened by virt-viewer) +- Resolves: bz#1727821 + (Failed to convert a source image to the qcow2 image encrypted by luks) + +* Fri Aug 16 2019 Danilo Cesar Lemes de Paula - 2.12.0-84.el8 +- kvm-vnc-detect-and-optimize-pageflips.patch [bz#1727033] +- kvm-block-backend-Make-blk_inc-dec_in_flight-public.patch [bz#1716349] +- kvm-virtio-blk-Increase-in_flight-for-request-restart-BH.patch [bz#1716349] +- kvm-block-Fix-AioContext-switch-for-drained-node.patch [bz#1716349] +- kvm-test-bdrv-drain-AioContext-switch-in-drained-section.patch [bz#1716349] +- kvm-block-Use-normal-drain-for-bdrv_set_aio_context.patch [bz#1716349] +- kvm-block-Fix-AioContext-switch-for-bs-drv-NULL.patch [bz#1716347] +- kvm-iothread-fix-crash-with-invalid-properties.patch [bz#1687541] +- kvm-iothread-replace-init_done_cond-with-a-semaphore.patch [bz#1687541] +- kvm-RHEL-disable-hostmem-memfd.patch [bz#1740797] +- Resolves: bz#1687541 + (qemu aborted when start guest with a big iothreads) +- Resolves: bz#1716347 + (Qemu Core dump when quit vm that's in status "paused(io-error)" with data plane enabled) +- Resolves: bz#1716349 + (qemu with iothreads enabled crashes on resume after enospc pause for disk extension) +- Resolves: bz#1727033 + (vnc server should detect page-flips and avoid sending fullscreen updates then.) 
+- Resolves: bz#1740797 + (Disable memfd in QEMU) + +* Thu Aug 01 2019 Danilo Cesar Lemes de Paula - 2.12.0-83.el8 +- kvm-hw-block-pflash_cfi01-Add-missing-DeviceReset-handle.patch [bz#1707192] +- kvm-block-file-posix-Unaligned-O_DIRECT-block-status.patch [bz#1678979] +- kvm-iotests-Test-unaligned-raw-images-with-O_DIRECT.patch [bz#1678979] +- kvm-nbd-client-Lower-min_block-for-block-status-unaligne.patch [bz#1678979] +- kvm-nbd-client-Reject-inaccessible-tail-of-inconsistent-.patch [bz#1678979] +- kvm-nbd-client-Support-qemu-img-convert-from-unaligned-s.patch [bz#1678979] +- kvm-block-Add-bdrv_get_request_alignment.patch [bz#1678979] +- kvm-nbd-server-Advertise-actual-minimum-block-size.patch [bz#1678979] +- kvm-slirp-check-sscanf-result-when-emulating-ident.patch [bz#1727642] +- kvm-slirp-fix-big-little-endian-conversion-in-ident-prot.patch [bz#1727642] +- kvm-slirp-ensure-there-is-enough-space-in-mbuf-to-null-t.patch [bz#1727642] +- kvm-slirp-don-t-manipulate-so_rcv-in-tcp_emu.patch [bz#1727642] +- kvm-tap-set-vhostfd-passed-from-qemu-cli-to-non-blocking.patch [bz#1732642] +- kvm-Fix-heap-overflow-in-ip_reass-on-big-packet-input.patch [bz#1734751] +- Resolves: bz#1678979 + (qemu-img convert abort when converting image with unaligned size (qemu-img: block/io.c:2134: bdrv_co_block_status: Assertion `*pnum && (((*pnum) % (align)) == 0) && align > offset - aligned_offset\' failed)) +- Resolves: bz#1707192 + (implement missing reset handler for cfi.pflash01 - slow train) +- Resolves: bz#1727642 + (CVE-2019-6778 qemu-kvm: QEMU: slirp: heap buffer overflow in tcp_emu()) +- Resolves: bz#1732642 + (enable the virtio-net frontend to work with the vhost-net backend in SEV guests) +- Resolves: bz#1734751 + (CVE-2019-14378 qemu-kvm: QEMU: slirp: heap buffer overflow during packet reassembly [rhel-8.1.0]) + +* Tue Jul 23 2019 Danilo Cesar Lemes de Paula - 2.12.0-82.el8 +- kvm-i386-Add-new-model-of-Cascadelake-Server.patch [bz#1629906] +- 
kvm-i386-Update-stepping-of-Cascadelake-Server.patch [bz#1629906] +- kvm-target-i386-Disable-MPX-support-on-named-CPU-models.patch [bz#1629906] +- kvm-i386-remove-the-INTEL_PT-CPUID-bit-from-named-CPU-NEW.patch [bz#1629906] +- kvm-i386-Disable-OSPKE-on-CPU-model-definitions-NEW.patch [bz#1629906] +- kvm-block-ssh-Convert-from-DPRINTF-macro-to-trace-events.patch [bz#1513367] +- kvm-block-ssh-Do-not-report-read-write-flush-errors-to-t.patch [bz#1513367] +- kvm-qemu-iotests-Fix-paths-for-NFS.patch [bz#1513367] +- kvm-qemu-iotests-Filter-NFS-paths.patch [bz#1513367] +- kvm-iotests-Filter-SSH-paths.patch [bz#1513367] +- kvm-block-ssh-Implement-.bdrv_refresh_filename.patch [bz#1513367] +- kvm-iotests-Use-Python-byte-strings-where-appropriate.patch [bz#1513367] +- kvm-iotests-Unify-log-outputs-between-Python-2-and-3.patch [bz#1513367] +- kvm-ssh-switch-from-libssh2-to-libssh.patch [bz#1513367] +- kvm-redhat-switch-from-libssh2-to-libssh.patch [bz#1513367] +- kvm-block-gluster-limit-the-transfer-size-to-512-MiB.patch [bz#1728657] +- kvm-s390-cpumodel-fix-description-for-the-new-vector-fac.patch [bz#1729975] +- kvm-s390x-cpumodel-remove-esort-from-the-default-model.patch [bz#1729975] +- kvm-s390x-cpumodel-also-change-name-of-vxbeh.patch [bz#1729975] +- kvm-s390x-cpumodel-change-internal-name-of-vxpdeh-to-mat.patch [bz#1729975] +- kvm-target-i386-sev-Do-not-unpin-ram-device-memory-regio.patch [bz#1728958] +- kvm-i386-Save-EFER-for-32-bit-targets.patch [bz#1689269] +- kvm-target-i386-rename-HF_SVMI_MASK-to-HF_GUEST_MASK.patch [bz#1689269] +- kvm-target-i386-kvm-add-VMX-migration-blocker.patch [bz#1689269] +- kvm-target-i386-kvm-just-return-after-migrate_add_blocke.patch [bz#1689269] +- kvm-target-i386-kvm-Delete-VMX-migration-blocker-on-vCPU.patch [bz#1689269] +- kvm-Introduce-kvm_arch_destroy_vcpu.patch [bz#1689269] +- kvm-target-i386-kvm-Use-symbolic-constant-for-DB-BP-exce.patch [bz#1689269] +- kvm-target-i386-kvm-Re-inject-DB-to-guest-with-updated-D.patch [bz#1689269] +- 
kvm-target-i386-kvm-Block-migration-for-vCPUs-exposed-wi.patch [bz#1689269] +- kvm-target-i386-kvm-do-not-initialize-padding-fields.patch [bz#1689269] +- kvm-linux-headers-synchronize-generic-and-x86-KVM-header.patch [bz#1689269] +- kvm-vmstate-Add-support-for-kernel-integer-types.patch [bz#1689269] +- kvm-target-i386-kvm-Add-support-for-save-and-restore-nes.patch [bz#1689269] +- kvm-target-i386-kvm-Add-support-for-KVM_CAP_EXCEPTION_PA.patch [bz#1689269] +- kvm-target-i386-kvm-Add-nested-migration-blocker-only-wh.patch [bz#1689269] +- kvm-target-i386-kvm-Demand-nested-migration-kernel-capab.patch [bz#1689269] +- kvm-target-i386-skip-KVM_GET-SET_NESTED_STATE-if-VMX-dis.patch [bz#1689269] +- kvm-i386-kvm-Do-not-sync-nested-state-during-runtime.patch [bz#1689269] +- Resolves: bz#1513367 + (qemu with libssh) +- Resolves: bz#1629906 + ([Intel 8.1 Feat] qemu-kvm Introduce Cascade Lake (CLX) cpu model) +- Resolves: bz#1689269 + (Nested KVM: support for migration of nested hypervisors - Slow Train) +- Resolves: bz#1728657 + ('qemu-io write' to a raw image over libgfapi fails) +- Resolves: bz#1728958 + (Hot unplug vfio-pci NIC devices from sev guest will cause qemu-kvm: sev_ram_block_removed: failed to unregister region) +- Resolves: bz#1729975 + (RHEL 8.1 Pre-Beta - Fix for hardware CPU Model) + +* Mon Jul 08 2019 Miroslav Rezanina - 2.12.0-81.el8 +- kvm-target-i386-add-MDS-NO-feature.patch [bz#1714792] +- kvm-virtio-gpu-pass-down-VirtIOGPU-pointer-to-a-bunch-of.patch [bz#1531543] +- kvm-virtio-gpu-add-iommu-support.patch [bz#1531543] +- kvm-virtio-gpu-fix-unmap-in-error-path.patch [bz#1531543] +- Resolves: bz#1531543 + ([RFE] add iommu support to virtio-gpu) +- Resolves: bz#1714792 + ([Intel 8.1 FEAT] MDS_NO exposure to guest) + +* Tue Jul 02 2019 Danilo Cesar Lemes de Paula - 2.12.0-80.el8 +- kvm-qxl-check-release-info-object.patch [bz#1712705] +- kvm-iotests-Make-182-do-without-device_add.patch [bz#1707598] +- Resolves: bz#1707598 + (qemu-iotest 182 fails without device 
hotplugging support) +- Resolves: bz#1712705 + (CVE-2019-12155 qemu-kvm: QEMU: qxl: null pointer dereference while releasing spice resources [rhel-8]) + +* Fri Jun 28 2019 Danilo de Paula - 15:2.12.0-79 +- Rebuild all virt packages to fix RHEL's upgrade path +- Resolves: rhbz#1695587 + (Ensure modular RPM upgrade path) + +* Thu Jun 20 2019 Miroslav Rezanina - 2.12.0-78.el8 +- kvm-gluster-Handle-changed-glfs_ftruncate-signature.patch [bz#1721983] +- kvm-gluster-the-glfs_io_cbk-callback-function-pointer-ad.patch [bz#1721983] +- Resolves: bz#1721983 + (qemu-kvm can't be build with new gluster version (6.0.6)) + +* Thu Jun 13 2019 Danilo Cesar Lemes de Paula - 2.12.0-77.el8 +- kvm-i386-Make-arch_capabilities-migratable.patch [bz#1709970] +- kvm-spapr-Fix-ibm-max-associativity-domains-property-num.patch [bz#1710662] +- kvm-linux-headers-Update-for-NVLink2-passthrough-downstr.patch [bz#1710662] +- kvm-pci-Move-NVIDIA-vendor-id-to-the-rest-of-ids.patch [bz#1710662] +- kvm-vfio-quirks-Add-common-quirk-alloc-helper.patch [bz#1710662] +- kvm-vfio-Make-vfio_get_region_info_cap-public.patch [bz#1710662] +- kvm-spapr-Support-NVIDIA-V100-GPU-with-NVLink2.patch [bz#1710662] +- kvm-qemu-kvm.spec-bump-libseccomp-2.4.0.patch [bz#1719578] +- Resolves: bz#1709970 + ([Intel 8.1 Bug] [KVM][CLX] CPUID_7_0_EDX_ARCH_CAPABILITIES is not enabled in VM - qemu-kvm) +- Resolves: bz#1710662 + ([IBM 8.1 FEAT] POWER9 - Virt: qemu: NVLink2 passthru to guest - Nvidia Volta (GPU) (kvm)) +- Resolves: bz#1719578 + (VM failed to start with error "failed to install seccomp syscall filter in the kernel") + +* Tue Jun 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-76.el8 +- kvm-Introduce-new-no_guest_reset-parameter-for-usb-host-.patch [bz#1713677] +- kvm-usb-call-reset-handler-before-updating-state.patch [bz#1713677] +- kvm-usb-host-skip-reset-for-untouched-devices.patch [bz#1713677] +- kvm-usb-host-avoid-libusb_set_configuration-calls.patch [bz#1713677] +- 
kvm-virtio-scsi-Move-BlockBackend-back-to-the-main-AioCo.patch [bz#1673396 bz#1673401] +- kvm-scsi-disk-Acquire-the-AioContext-in-scsi_-_realize.patch [bz#1673396 bz#1673401] +- kvm-virtio-scsi-Forbid-devices-with-different-iothreads-.patch [bz#1673396 bz#1673401] +- kvm-Disable-VXHS-support.patch [bz#1714933] +- Resolves: bz#1673396 + (qemu-kvm core dumped after hotplug the deleted disk with iothread parameter) +- Resolves: bz#1673401 + (Qemu core dump when start guest with two disks using same drive) +- Resolves: bz#1713677 + (Detached device when trying to upgrade USB device firmware when in doing USB Passthrough via QEMU) +- Resolves: bz#1714933 + (Disable VXHS in qemu-kvm) + +* Fri May 24 2019 Danilo Cesar Lemes de Paula - 2.12.0-75.el8 +- kvm-s390x-cpumodel-enum-type-S390FeatGroup-now-gets-gene.patch [bz#1660912] +- kvm-linux-headers-update-against-Linux-5.2-rc1.patch [bz#1660912] +- kvm-s390x-cpumodel-ignore-csske-for-expansion.patch [bz#1660912] +- kvm-s390x-cpumodel-Miscellaneous-Instruction-Extensions-.patch [bz#1660912] +- kvm-s390x-cpumodel-msa9-facility.patch [bz#1660912] +- kvm-s390x-cpumodel-vector-enhancements.patch [bz#1660912] +- kvm-s390x-cpumodel-enhanced-sort-facility.patch [bz#1660912] +- kvm-s390x-cpumodel-add-Deflate-conversion-facility.patch [bz#1660912] +- kvm-s390x-cpumodel-add-gen15-defintions.patch [bz#1660912] +- kvm-s390x-cpumodel-wire-up-8561-and-8562-as-gen15-machin.patch [bz#1660912] +- kvm-spice-set-device-address-and-device-display-ID-in-QX.patch [bz#1712946] +- kvm-hw-pci-Add-missing-include.patch [bz#1712946] +- Resolves: bz#1660912 + ([IBM 8.1 FEAT] KVM s390x: Add hardware CPU Model - qemu part) +- Resolves: bz#1712946 + (qemu-kvm build is broken due to spice_qxl_set_max_monitors being deprecated) + +* Mon May 20 2019 Danilo Cesar Lemes de Paula - 2.12.0-74.el8 +- kvm-x86-cpu-Enable-CLDEMOTE-Demote-Cache-Line-cpu-featur.patch [bz#1696436] +- kvm-memory-Fix-the-memory-region-type-assignment-order.patch [bz#1667249] +- 
kvm-target-i386-sev-Do-not-pin-the-ram-device-memory-reg.patch [bz#1667249] +- kvm-block-Fix-invalidate_cache-error-path-for-parent-act.patch [bz#1673010] +- kvm-target-i386-define-md-clear-bit.patch [bz#1703302 bz#1703308] +- Resolves: bz#1667249 + (Fail to launch AMD SEV VM with assigned PCI device) +- Resolves: bz#1673010 + (Local VM and migrated VM on the same host can run with same RAW file as visual disk source while without shareable configured or lock manager enabled) +- Resolves: bz#1696436 + ([Intel 8.0 Feat] KVM Enabling SnowRidge new NIs - qemu-kvm) +- Resolves: bz#1703302 + (CVE-2018-12130 virt:rhel/qemu-kvm: hardware: Microarchitectural Fill Buffer Data Sampling (MFBDS) [rhel-8]) +- Resolves: bz#1703308 + (CVE-2018-12127 virt:rhel/qemu-kvm: hardware: Micro-architectural Load Port Data Sampling - Information Leak (MLPDS) [rhel-8]) + +* Tue May 14 2019 Danilo Cesar Lemes de Paula - 2.12.0-73.el8 +- kvm-i386-remove-the-INTEL_PT-CPUID-bit-from-named-CPU-mo.patch [bz#1561761] +- kvm-i386-Disable-OSPKE-on-CPU-model-definitions.patch [bz#1561761] +- Resolves: bz#1561761 + ([Intel 8.1 Feat] qemu-kvm Introduce Icelake cpu model) + +* Tue May 14 2019 Danilo Cesar Lemes de Paula - 2.12.0-72.el8 +- kvm-Use-KVM_GET_MSR_INDEX_LIST-for-MSR_IA32_ARCH_CAP.patch [bz#1707706] +- kvm-i386-kvm-Disable-arch_capabilities-if-MSR-can-t-be-s.patch [bz#1707706] +- Resolves: bz#1707706 + (/builddir/build/BUILD/qemu-2.12.0/target/i386/kvm.c:2031: kvm_put_msrs: Assertion `ret == cpu->kvm_msr_buf->nmsrs' failed.) 
+ +* Wed May 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-71.el8 +- kvm-s390-bios-Skip-bootmap-signature-entries.patch [bz#1683275] +- Resolves: bz#1683275 + ([IBM 8.1 FEAT] KVM: Secure Linux Boot Toleration (qemu)) + +* Tue May 07 2019 Danilo Cesar Lemes de Paula - 2.12.0-70.el8 +- kvm-i386-Add-new-MSR-indices-for-IA32_PRED_CMD-and-IA32_.patch [bz#1561761] +- kvm-i386-Add-CPUID-bit-and-feature-words-for-IA32_ARCH_C.patch [bz#1561761] +- kvm-i386-Add-CPUID-bit-for-PCONFIG.patch [bz#1561761] +- kvm-i386-Add-CPUID-bit-for-WBNOINVD.patch [bz#1561761] +- kvm-i386-Add-new-CPU-model-Icelake-Server-Client.patch [bz#1561761] +- kvm-Add-support-to-KVM_GET_MSR_FEATURE_INDEX_LIST-an.patch [bz#1561761] +- kvm-x86-Data-structure-changes-to-support-MSR-based-feat.patch [bz#1561761] +- kvm-x86-define-a-new-MSR-based-feature-word-FEATURE_WORD.patch [bz#1561761] +- kvm-i386-remove-the-new-CPUID-PCONFIG-from-Icelake-Serve.patch [bz#1561761] +- kvm-Revert-i386-Add-CPUID-bit-for-PCONFIG.patch [bz#1561761] +- Resolves: bz#1561761 + ([Intel 8.1 Feat] qemu-kvm Introduce Icelake cpu model) + +* Fri May 03 2019 Danilo Cesar Lemes de Paula - 2.12.0-69.el8 +- kvm-tests-crypto-Use-the-IEC-binary-prefix-definitions.patch [bz#1680231] +- kvm-crypto-expand-algorithm-coverage-for-cipher-benchmar.patch [bz#1680231] +- kvm-crypto-remove-code-duplication-in-tweak-encrypt-decr.patch [bz#1680231] +- kvm-crypto-introduce-a-xts_uint128-data-type.patch [bz#1680231] +- kvm-crypto-convert-xts_tweak_encdec-to-use-xts_uint128-t.patch [bz#1680231] +- kvm-crypto-convert-xts_mult_x-to-use-xts_uint128-type.patch [bz#1680231] +- kvm-crypto-annotate-xts_tweak_encdec-as-inlineable.patch [bz#1680231] +- kvm-crypto-refactor-XTS-cipher-mode-test-suite.patch [bz#1680231] +- kvm-crypto-add-testing-for-unaligned-buffers-with-XTS-ci.patch [bz#1680231] +- Resolves: bz#1680231 + (severe performance impact using luks format) + +* Mon Apr 29 2019 Danilo Cesar Lemes de Paula - 2.12.0-68.el8 +- 
kvm-s390x-ipl-Try-to-detect-Linux-vs-non-Linux-for-initi.patch [bz#1699070] +- kvm-loader-Check-access-size-when-calling-rom_ptr-to-avo.patch [bz#1699070] +- kvm-hw-s390x-Use-the-IEC-binary-prefix-definitions.patch [bz#1699070] +- kvm-s390x-storage-attributes-fix-CMMA_BLOCK_SIZE-usage.patch [bz#1699070] +- kvm-s390x-cpumodel-fix-segmentation-fault-when-baselinin.patch [bz#1699070] +- kvm-hw-s390x-s390-pci-bus-Convert-sysbus-init-function-t.patch [bz#1699070] +- kvm-s390x-pci-properly-fail-if-the-zPCI-device-cannot-be.patch [bz#1699070] +- kvm-s390x-pci-rename-hotplug-handler-callbacks.patch [bz#1699070] +- kvm-s390-avoid-potential-null-dereference-in-s390_pcihos.patch [bz#1699070] +- kvm-s390x-pci-Send-correct-event-on-hotplug.patch [bz#1699070] +- kvm-s390x-pci-Set-the-iommu-region-size-mpcifc-request.patch [bz#1699070] +- kvm-s390x-pci-Always-delete-and-free-the-release_timer.patch [bz#1699070] +- kvm-s390x-pci-Ignore-the-unplug-call-if-we-already-have-.patch [bz#1699070] +- kvm-s390x-pci-Use-hotplug_dev-instead-of-looking-up-the-.patch [bz#1699070] +- kvm-s390x-pci-Move-some-hotplug-checks-to-the-pre_plug-h.patch [bz#1699070] +- kvm-s390x-pci-Introduce-unplug-requests-and-split-unplug.patch [bz#1699070] +- kvm-s390x-pci-Drop-release-timer-and-replace-it-with-a-f.patch [bz#1699070] +- kvm-s390x-pci-mark-zpci-devices-as-unmigratable.patch [bz#1699070] +- kvm-s390x-pci-Fix-primary-bus-number-for-PCI-bridges.patch [bz#1699070] +- kvm-s390x-pci-Fix-hotplugging-of-PCI-bridges.patch [bz#1699070] +- kvm-s390x-pci-Warn-when-adding-PCI-devices-without-the-z.patch [bz#1699070] +- kvm-s390x-pci-Unplug-remaining-requested-devices-on-pcih.patch [bz#1699070] +- kvm-s390x-refactor-reset-reipl-handling.patch [bz#1699070] +- kvm-s390-ipl-fix-ipl-with-no-reboot.patch [bz#1699070] +- Resolves: bz#1699070 + (Backport s390x-related fixes for qemu-kvm) + +* Tue Apr 23 2019 Danilo Cesar Lemes de Paula - 2.12.0-67.el8 +- kvm-device_tree-Fix-integer-overflowing-in-load_device_t.patch 
[bz#1693116] +- Resolves: bz#1693116 + (CVE-2018-20815 qemu-kvm: QEMU: device_tree: heap buffer overflow while loading device tree blob [rhel-8.0]) + +* Mon Apr 15 2019 Danilo Cesar Lemes de Paula - 2.12.0-66.el8 +- kvm-iotests-153-Fix-dead-code.patch [bz#1694148] +- kvm-file-posix-Include-filename-in-locking-error-message.patch [bz#1694148] +- kvm-file-posix-Skip-effectiveless-OFD-lock-operations.patch [bz#1694148] +- kvm-file-posix-Drop-s-lock_fd.patch [bz#1694148] +- kvm-tests-Add-unit-tests-for-image-locking.patch [bz#1694148] +- kvm-file-posix-Fix-shared-locks-on-reopen-commit.patch [bz#1694148] +- kvm-iotests-Test-file-posix-locking-and-reopen.patch [bz#1694148] +- kvm-block-file-posix-do-not-fail-on-unlock-bytes.patch [bz#1694148] +- kvm-hostmem-file-remove-object-id-from-pmem-error-messag.patch [bz#1687596] +- kvm-redhat-setting-target-release-to-rhel-8.1.0.patch [] +- kvm-redhat-removing-iotest-182.patch [] +- Resolves: bz#1687596 + ([Intel 8.1 BUG][KVM][Crystal Ridge]object_get_canonical_path_component: assertion failed: (obj->parent != NULL)) +- Resolves: bz#1694148 + (QEMU image locking needn't double open fd number, and it should not fail when attempting to release locks) + +* Tue Apr 09 2019 Danilo Cesar Lemes de Paula - 2.12.0-65.el8 +- kvm-s390x-cpumodel-mepochptff-warn-when-no-mepoch-and-re.patch [bz#1664371] +- kvm-s390x-cpumodel-add-z14-GA2-model.patch [bz#1664371] +- kvm-redhat-s390x-cpumodel-enable-mepoch-by-default-for-z.patch [bz#1664371] +- kvm-intel_iommu-fix-operator-in-vtd_switch_address_space.patch [bz#1662272] +- kvm-intel_iommu-reset-intr_enabled-when-system-reset.patch [bz#1662272] +- kvm-pci-msi-export-msi_is_masked.patch [bz#1662272] +- kvm-i386-kvm-ignore-masked-irqs-when-update-msi-routes.patch [bz#1662272] +- Resolves: bz#1662272 + (Boot guest with device assignment+vIOMMU, qemu prompts "vtd_interrupt_remap_msi: MSI address low 32 bit invalid: 0x0" when first rebooting guest) +- Resolves: bz#1664371 + ([IBM 8.1 FEAT] Update 
hardware CPU Model z14 (kvm) - qemu part) + +* Mon Apr 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-64.el8 +- kvm-doc-fix-the-configuration-path.patch [bz#1645411] +- kvm-Increase-number-of-iotests-being-run-as-a-part-of-RH.patch [bz#1664463] +- kvm-Load-kvm-module-during-boot.patch [bz#1676907 bz#1685995] +- kvm-qemu-kvm.spec.template-Update-pyton-path-to-system-i.patch [] +- Resolves: bz#1645411 + (the "fsfreeze-hook" script path shown by command "qemu-ga --help" or "man qemu-ga" is wrong) +- Resolves: bz#1664463 + (Modify iotest behavior to include luks and nbd and fail build if iotests fail) +- Resolves: bz#1676907 + (/dev/kvm device exists but kernel module is not loaded on boot up causing VM start to fail in libvirt) +- Resolves: bz#1685995 + (/dev/kvm device exists but kernel module is not loaded on boot up causing VM start to fail in libvirt) + +* Tue Feb 26 2019 Danilo Cesar Lemes de Paula - 2.12.0-63.el8 +- kvm-scsi-generic-avoid-possible-out-of-bounds-access-to-.patch [bz#1668162] +- Resolves: bz#1668162 + (CVE-2019-6501 qemu-kvm: QEMU: scsi-generic: possible OOB access while handling inquiry request [rhel-8]) + +* Mon Feb 25 2019 Danilo Cesar Lemes de Paula - 2.12.0-62.el8 +- kvm-slirp-check-data-length-while-emulating-ident-functi.patch [bz#1669069] +- Resolves: bz#1669069 + (CVE-2019-6778 qemu-kvm: QEMU: slirp: heap buffer overflow in tcp_emu() [rhel-8.0]) + +* Mon Feb 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-61.el8 +- kvm-qemu-ga-make-get-fsinfo-work-over-pci-bridges.patch [bz#1666952] +- kvm-qga-fix-driver-leak-in-guest-get-fsinfo.patch [bz#1666952] +- Resolves: bz#1666952 + (qemu-guest-agent does not parse PCI bridge links in "build_guest_fsinfo_for_real_device" (q35)) + +* Mon Jan 28 2019 Danilo Cesar Lemes de Paula - 2.12.0-60.el8 +- kvm-ne2000-fix-possible-out-of-bound-access-in-ne2000_re.patch [bz#1636784] +- kvm-rtl8139-fix-possible-out-of-bound-access.patch [bz#1636784] +- kvm-pcnet-fix-possible-buffer-overflow.patch [bz#1636784] +- 
kvm-net-ignore-packet-size-greater-than-INT_MAX.patch [bz#1636784] +- kvm-net-drop-too-large-packet-early.patch [bz#1636784] +- kvm-net-hub-suppress-warnings-of-no-host-network-for-qte.patch [bz#1636784] +- kvm-virtio-net-test-accept-variable-length-argument-in-p.patch [bz#1636784] +- kvm-virtio-net-test-remove-unused-macro.patch [bz#1636784] +- kvm-virtio-net-test-add-large-tx-buffer-test.patch [bz#1636784] +- kvm-s390x-Return-specification-exception-for-unimplement.patch [bz#1668261] +- kvm-cpus-ignore-ESRCH-in-qemu_cpu_kick_thread.patch [bz#1665844] +- Resolves: bz#1636784 + (CVE-2018-17963 qemu-kvm: Qemu: net: ignore packets with large size [rhel-8]) +- Resolves: bz#1665844 + (Guest quit with error when hotunplug cpu) +- Resolves: bz#1668261 + ([RHEL8] Backport diag308 stable exception fix (qemu-kvm)) + +* Thu Jan 24 2019 Danilo Cesar Lemes de Paula - 2.12.0-59.el8 +- kvm-hw-scsi-cleanups-before-VPD-BL-emulation.patch [bz#1639957] +- kvm-hw-scsi-centralize-SG_IO-calls-into-single-function.patch [bz#1639957] +- kvm-hw-scsi-add-VPD-Block-Limits-emulation.patch [bz#1639957] +- kvm-scsi-disk-Block-Device-Characteristics-emulation-fix.patch [bz#1639957] +- kvm-scsi-generic-keep-VPD-page-list-sorted.patch [bz#1639957] +- kvm-scsi-generic-avoid-out-of-bounds-access-to-VPD-page-.patch [bz#1639957] +- kvm-scsi-generic-avoid-invalid-access-to-struct-when-emu.patch [bz#1639957] +- kvm-scsi-generic-do-not-do-VPD-emulation-for-sense-other.patch [bz#1639957] +- Resolves: bz#1639957 + ([RHEL.8] scsi host device passthrough limits IO writes - slow train) + +* Mon Jan 21 2019 Danilo Cesar Lemes de Paula - 2.12.0-58.el8 +- kvm-block-Update-flags-in-bdrv_set_read_only.patch [bz#1644996] +- kvm-block-Add-auto-read-only-option.patch [bz#1644996] +- kvm-rbd-Close-image-in-qemu_rbd_open-error-path.patch [bz#1644996] +- kvm-block-Require-auto-read-only-for-existing-fallbacks.patch [bz#1644996] +- kvm-nbd-Support-auto-read-only-option.patch [bz#1644996] +- 
kvm-file-posix-Support-auto-read-only-option.patch [bz#1644996] +- kvm-curl-Support-auto-read-only-option.patch [bz#1644996] +- kvm-gluster-Support-auto-read-only-option.patch [bz#1644996] +- kvm-iscsi-Support-auto-read-only-option.patch [bz#1644996] +- kvm-block-Make-auto-read-only-on-default-for-drive.patch [bz#1644996] +- kvm-qemu-iotests-Test-auto-read-only-with-drive-and-bloc.patch [bz#1644996] +- kvm-block-Fix-update-of-BDRV_O_AUTO_RDONLY-in-update_fla.patch [bz#1644996] +- kvm-qemu-img-Add-C-option-for-convert-with-copy-offloadi.patch [bz#1623082] +- kvm-iotests-Add-test-for-qemu-img-convert-C-compatibilit.patch [bz#1623082] +- Resolves: bz#1623082 + ([rhel.8.0]Target files for 'qemu-img convert' do not support thin_provisoning with iscsi/nfs backend) +- Resolves: bz#1644996 + (block-commit can't be used with -blockdev) + +* Fri Jan 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-57.el8 +- kvm-qemu-kvm.spec.template-Update-files-for-tests-rpm-to.patch [bz#1601107] + +* Fri Jan 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-56.el8 +- kvm-Run-iotests-as-part-of-the-build-process.patch [bz#1661026] +- kvm-Introduce-the-qemu-kvm-tests-rpm.patch [bz#1601107] +- Resolves: bz#1601107 + (qemu-kvm packaging: make running qemu-iotests more robust) +- Resolves: bz#1661026 + (Run iotests as part of build process) + +* Tue Jan 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-55.el8 +- kvm-block-Don-t-inactivate-children-before-parents.patch [bz#1659395] +- kvm-iotests-Test-migration-with-blockdev.patch [bz#1659395] +- Resolves: bz#1659395 + (src qemu core dump when do migration ( block device node-name changed after change cdrom) - Slow Train) + +* Tue Jan 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-54.el8 +- kvm-s390x-tcg-avoid-overflows-in-time2tod-tod2time.patch [bz#1653569] +- kvm-s390x-kvm-pass-values-instead-of-pointers-to-kvm_s39.patch [bz#1653569] +- kvm-s390x-tod-factor-out-TOD-into-separate-device.patch [bz#1653569] +- kvm-s390x-tcg-drop-tod_basetime.patch [bz#1653569] 
+- kvm-s390x-tcg-properly-implement-the-TOD.patch [bz#1653569] +- kvm-s390x-tcg-SET-CLOCK-COMPARATOR-can-clear-CKC-interru.patch [bz#1653569] +- kvm-s390x-tcg-implement-SET-CLOCK.patch [bz#1653569] +- kvm-s390x-tcg-rearm-the-CKC-timer-during-migration.patch [bz#1653569] +- kvm-s390x-tcg-fix-locking-problem-with-tcg_s390_tod_upda.patch [bz#1653569] +- kvm-hw-s390x-Include-the-tod-qemu-also-for-builds-with-d.patch [bz#1653569] +- kvm-s390x-tod-Properly-stop-the-KVM-TOD-while-the-guest-.patch [bz#1653569] +- kvm-hw-s390x-Fix-bad-mask-in-time2tod.patch [bz#1653569] +- kvm-migration-discard-non-migratable-RAMBlocks.patch [bz#1539285] +- kvm-vfio-pci-do-not-set-the-PCIDevice-has_rom-attribute.patch [bz#1539285] +- kvm-memory-exec-Expose-all-memory-block-related-flags.patch [bz#1539285] +- kvm-memory-exec-switch-file-ram-allocation-functions-to-.patch [bz#1539285] +- kvm-configure-add-libpmem-support.patch [bz#1539285] +- kvm-hostmem-file-add-the-pmem-option.patch [bz#1539285] +- kvm-mem-nvdimm-ensure-write-persistence-to-PMEM-in-label.patch [bz#1539285] +- kvm-migration-ram-Add-check-and-info-message-to-nvdimm-p.patch [bz#1539285] +- kvm-migration-ram-ensure-write-persistence-on-loading-al.patch [bz#1539285] +- Resolves: bz#1539285 + ([Intel 8.0 Bug] [KVM][Crystal Ridge] Lack of data persistence guarantee of QEMU writes to host PMEM) +- Resolves: bz#1653569 + (Stress guest and stop it, then do live migration, guest hit call trace on destination end) + +* Tue Jan 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-53.el8 +- kvm-ui-add-qapi-parser-for-display.patch [bz#1652871] +- kvm-ui-switch-trivial-displays-to-qapi-parser.patch [bz#1652871] +- kvm-qapi-Add-rendernode-display-option-for-egl-headless.patch [bz#1652871] +- kvm-ui-Allow-specifying-rendernode-display-option-for-eg.patch [bz#1652871] +- kvm-qapi-add-query-display-options-command.patch [bz#1652871] +- Resolves: bz#1652871 + (QEMU doesn't expose rendernode option for egl-headless display type) + +* Fri Jan 04 2019 
Danilo Cesar Lemes de Paula - 2.12.0-52.el8 +- kvm-Add-edk2-Requires-to-qemu-kvm.patch [bz#1654276] +- Resolves: bz#1654276 + (qemu-kvm: Should depend on the architecture-appropriate guest firmware) + +* Mon Dec 24 2018 Danilo Cesar Lemes de Paula - 2.12.0-51.el8 +- kvm-x86-host-phys-bits-limit-option.patch [bz#1598284] +- kvm-rhel-Set-host-phys-bits-limit-48-on-rhel-machine-typ.patch [bz#1598284] +- kvm-i386-do-not-migrate-MSR_SMI_COUNT-on-machine-types-2.patch [bz#1659565] +- kvm-pc-x-migrate-smi-count-to-PC_RHEL_COMPAT.patch [bz#1659565] +- kvm-slow-train-kvm-clear-out-KVM_ASYNC_PF_DELIVERY_AS_PF.patch [bz#1656829] +- Resolves: bz#1598284 + ([Intel 8.0 Alpha] physical bits should < 48 when host with 5level paging &EPT5 and qemu command with "-cpu qemu64" parameters.) +- Resolves: bz#1656829 + (8->7 migration failed: qemu-kvm: error: failed to set MSR 0x4b564d02 to 0x27fc13285) +- Resolves: bz#1659565 + (machine type: required compat flag x-migrate-smi-count=off) + +* Tue Dec 18 2018 Danilo Cesar Lemes de Paula - 2.12.0-51 +- kvm-Add-edk2-Requires-to-qemu-kvm.patch [bz#1654276] +- Resolves: bz#1654276 + (qemu-kvm: Should depend on the architecture-appropriate guest firmware) + +* Mon Dec 17 2018 Danilo Cesar Lemes de Paula - +- kvm-redhat-enable-tpmdev-passthrough.patch [bz#1654486] +- Resolves: bz#1654486 + ([RFE] enable TPM passthrough at compile time (qemu-kvm)) + +* Fri Dec 14 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-48 +- kvm-redhat-use-autopatch-instead-of-PATCHAPPLY.patch [bz#1613128] +- kvm-redhat-Removing-some-unused-build-flags-in-the-spec-.patch [bz#1613128] +- kvm-redhat-Fixing-rhev-ma-conflicts.patch [bz#1613126] +- kvm-redhat-Remove-_smp_mflags-cleanup-workaround-for-s39.patch [bz#1613128] +- kvm-redhat-Removing-dead-code-from-the-spec-file.patch [bz#1613128] +- kvm-i386-Add-stibp-flag-name.patch [bz#1639446] +- kvm-Add-functional-acceptance-tests-infrastructure.patch [bz#1655807] +- 
kvm-scripts-qemu.py-allow-adding-to-the-list-of-extra-ar.patch [bz#1655807] +- kvm-Acceptance-tests-add-quick-VNC-tests.patch [bz#1655807] +- kvm-scripts-qemu.py-introduce-set_console-method.patch [bz#1655807] +- kvm-Acceptance-tests-add-Linux-kernel-boot-and-console-c.patch [bz#1655807] +- kvm-Bootstrap-Python-venv-for-tests.patch [bz#1655807] +- kvm-Acceptance-tests-add-make-rule-for-running-them.patch [bz#1655807] +- Resolves: bz#1613126 + (Check and fix qemu-kvm-rhev and qemu-kvm-ma conflicts in qemu-kvm for rhel-8) +- Resolves: bz#1613128 + (Spec file clean up) +- Resolves: bz#1639446 + (Cross migration from RHEL7.5 to RHEL8 shouldn't fail with cpu flag stibp [qemu-kvm]) +- Resolves: bz#1655807 + (Backport avocado-qemu tests for QEMU 2.12) + +* Tue Dec 11 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-47 +- kvm-Disable-CONFIG_IPMI-and-CONFIG_I2C-for-ppc64.patch [bz#1640044] +- kvm-Disable-CONFIG_CAN_BUS-and-CONFIG_CAN_SJA1000.patch [bz#1640042] +- Resolves: bz#1640042 + (Disable CONFIG_CAN_BUS and CONFIG_CAN_SJA1000 config switches) +- Resolves: bz#1640044 + (Disable CONFIG_I2C and CONFIG_IPMI in default-configs/ppc64-softmmu.mak) + +* Tue Dec 11 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-46 +- kvm-qcow2-Give-the-refcount-cache-the-minimum-possible-s.patch [bz#1656507] +- kvm-docs-Document-the-new-default-sizes-of-the-qcow2-cac.patch [bz#1656507] +- kvm-qcow2-Fix-Coverity-warning-when-calculating-the-refc.patch [bz#1656507] +- kvm-include-Add-IEC-binary-prefixes-in-qemu-units.h.patch [bz#1656507] +- kvm-qcow2-Options-documentation-fixes.patch [bz#1656507] +- kvm-include-Add-a-lookup-table-of-sizes.patch [bz#1656507] +- kvm-qcow2-Make-sizes-more-humanly-readable.patch [bz#1656507] +- kvm-qcow2-Avoid-duplication-in-setting-the-refcount-cach.patch [bz#1656507] +- kvm-qcow2-Assign-the-L2-cache-relatively-to-the-image-si.patch [bz#1656507] +- kvm-qcow2-Increase-the-default-upper-limit-on-the-L2-cac.patch [bz#1656507] +- 
kvm-qcow2-Resize-the-cache-upon-image-resizing.patch [bz#1656507] +- kvm-qcow2-Set-the-default-cache-clean-interval-to-10-min.patch [bz#1656507] +- kvm-qcow2-Explicit-number-replaced-by-a-constant.patch [bz#1656507] +- kvm-block-backend-Set-werror-rerror-defaults-in-blk_new.patch [bz#1657637] +- kvm-qcow2-Fix-cache-clean-interval-documentation.patch [bz#1656507] +- Resolves: bz#1656507 + ([RHEL.8] qcow2 cache is too small) +- Resolves: bz#1657637 + (Wrong werror default for -device drive=) + +* Thu Dec 06 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-45 +- kvm-target-ppc-add-basic-support-for-PTCR-on-POWER9.patch [bz#1639069] +- kvm-linux-headers-Update-for-nested-KVM-HV-downstream-on.patch [bz#1639069] +- kvm-target-ppc-Add-one-reg-id-for-ptcr.patch [bz#1639069] +- kvm-ppc-spapr_caps-Add-SPAPR_CAP_NESTED_KVM_HV.patch [bz#1639069] +- kvm-Re-enable-CONFIG_HYPERV_TESTDEV.patch [bz#1651195] +- kvm-qxl-use-guest_monitor_config-for-local-renderer.patch [bz#1610163] +- kvm-Declare-cirrus-vga-as-deprecated.patch [bz#1651994] +- kvm-Do-not-build-bluetooth-support.patch [bz#1654651] +- kvm-vfio-helpers-Fix-qemu_vfio_open_pci-crash.patch [bz#1645840] +- kvm-balloon-Allow-multiple-inhibit-users.patch [bz#1650272] +- kvm-Use-inhibit-to-prevent-ballooning-without-synchr.patch [bz#1650272] +- kvm-vfio-Inhibit-ballooning-based-on-group-attachment-to.patch [bz#1650272] +- kvm-vfio-ccw-pci-Allow-devices-to-opt-in-for-ballooning.patch [bz#1650272] +- kvm-vfio-pci-Handle-subsystem-realpath-returning-NULL.patch [bz#1650272] +- kvm-vfio-pci-Fix-failure-to-close-file-descriptor-on-err.patch [bz#1650272] +- kvm-postcopy-Synchronize-usage-of-the-balloon-inhibitor.patch [bz#1650272] +- Resolves: bz#1610163 + (guest shows border blurred screen with some resolutions when qemu boot with -device qxl-vga ,and guest on rhel7.6 has no such question) +- Resolves: bz#1639069 + ([IBM 8.0 FEAT] POWER9 - Nested virtualization in RHEL8.0 KVM for ppc64le - qemu-kvm side) +- Resolves: bz#1645840 + 
(Qemu core dump when hotplug nvme:// drive via -blockdev) +- Resolves: bz#1650272 + (Ballooning is incompatible with vfio assigned devices, but not prevented) +- Resolves: bz#1651195 + (Re-enable hyperv-testdev device) +- Resolves: bz#1651994 + (Declare the "Cirrus VGA" device emulation of QEMU as deprecated in RHEL8) +- Resolves: bz#1654651 + (Qemu: hw: bt: keep bt/* objects from building [rhel-8.0]) + +* Tue Nov 27 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-44 +- kvm-block-Make-more-block-drivers-compile-time-configura.patch [bz#1598842 bz#1598842] +- kvm-RHEL8-Add-disable-configure-options-to-qemu-spec-fil.patch [bz#1598842] +- Resolves: bz#1598842 + (Compile out unused block drivers) + +* Mon Nov 26 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-43 +- kvm-configure-add-test-for-libudev.patch [bz#1636185] +- kvm-qga-linux-report-disk-serial-number.patch [bz#1636185] +- kvm-qga-linux-return-disk-device-in-guest-get-fsinfo.patch [bz#1636185] +- kvm-qemu-error-introduce-error-warn-_report_once.patch [bz#1625173] +- kvm-intel-iommu-start-to-use-error_report_once.patch [bz#1625173] +- kvm-intel-iommu-replace-more-vtd_err_-traces.patch [bz#1625173] +- kvm-intel_iommu-introduce-vtd_reset_caches.patch [bz#1625173] +- kvm-intel_iommu-better-handling-of-dmar-state-switch.patch [bz#1625173] +- kvm-intel_iommu-move-ce-fetching-out-when-sync-shadow.patch [bz#1625173 bz#1629616] +- kvm-intel_iommu-handle-invalid-ce-for-shadow-sync.patch [bz#1625173 bz#1629616] +- kvm-block-remove-bdrv_dirty_bitmap_make_anon.patch [bz#1518989] +- kvm-block-simplify-code-around-releasing-bitmaps.patch [bz#1518989] +- kvm-hbitmap-Add-advance-param-to-hbitmap_iter_next.patch [bz#1518989] +- kvm-test-hbitmap-Add-non-advancing-iter_next-tests.patch [bz#1518989] +- kvm-block-dirty-bitmap-Add-bdrv_dirty_iter_next_area.patch [bz#1518989] +- kvm-blockdev-backup-add-bitmap-argument.patch [bz#1518989] +- kvm-dirty-bitmap-switch-assert-fails-to-errors-in-bdrv_m.patch [bz#1518989] +- 
kvm-dirty-bitmap-rename-bdrv_undo_clear_dirty_bitmap.patch [bz#1518989] +- kvm-dirty-bitmap-make-it-possible-to-restore-bitmap-afte.patch [bz#1518989] +- kvm-blockdev-rename-block-dirty-bitmap-clear-transaction.patch [bz#1518989] +- kvm-qapi-add-transaction-support-for-x-block-dirty-bitma.patch [bz#1518989] +- kvm-block-dirty-bitmaps-add-user_locked-status-checker.patch [bz#1518989] +- kvm-block-dirty-bitmaps-fix-merge-permissions.patch [bz#1518989] +- kvm-block-dirty-bitmaps-allow-clear-on-disabled-bitmaps.patch [bz#1518989] +- kvm-block-dirty-bitmaps-prohibit-enable-disable-on-locke.patch [bz#1518989] +- kvm-block-backup-prohibit-backup-from-using-in-use-bitma.patch [bz#1518989] +- kvm-nbd-forbid-use-of-frozen-bitmaps.patch [bz#1518989] +- kvm-bitmap-Update-count-after-a-merge.patch [bz#1518989] +- kvm-iotests-169-drop-deprecated-autoload-parameter.patch [bz#1518989] +- kvm-block-qcow2-improve-error-message-in-qcow2_inactivat.patch [bz#1518989] +- kvm-bloc-qcow2-drop-dirty_bitmaps_loaded-state-variable.patch [bz#1518989] +- kvm-dirty-bitmaps-clean-up-bitmaps-loading-and-migration.patch [bz#1518989] +- kvm-iotests-improve-169.patch [bz#1518989] +- kvm-iotests-169-add-cases-for-source-vm-resuming.patch [bz#1518989] +- kvm-pc-dimm-turn-alignment-assert-into-check.patch [bz#1630116] +- Resolves: bz#1518989 + (RFE: QEMU Incremental live backup) +- Resolves: bz#1625173 + ([NVMe Device Assignment] Guest could not boot up with q35+iommu) +- Resolves: bz#1629616 + (boot guest with q35+vIOMMU+ device assignment, qemu terminal shows "qemu-kvm: VFIO_UNMAP_DMA: -22" when return assigned network devices from vfio driver to ixgbe in guest) +- Resolves: bz#1630116 + (pc_dimm_get_free_addr: assertion failed: (QEMU_ALIGN_UP(address_space_start, align) == address_space_start)) +- Resolves: bz#1636185 + ([RFE] Report disk device name and serial number (qemu-guest-agent on Linux)) + +* Mon Nov 05 2018 Danilo Cesar Lemes de Paula - 2.12.0-42.el8 +- kvm-luks-Allow-share-rw-on.patch 
[bz#1629701] +- kvm-redhat-reenable-gluster-support.patch [bz#1599340] +- kvm-redhat-bump-libusb-requirement.patch [bz#1627970] +- Resolves: bz#1599340 + (Reenable glusterfs in qemu-kvm once BZ#1567292 gets fixed) +- Resolves: bz#1627970 + (symbol lookup error: /usr/libexec/qemu-kvm: undefined symbol: libusb_set_option) +- Resolves: bz#1629701 + ("share-rw=on" does not work for luks format image - Fast Train) + +* Tue Oct 16 2018 Danilo Cesar Lemes de Paula - 2.12.0-41.el8 +- kvm-block-rbd-pull-out-qemu_rbd_convert_options.patch [bz#1635585] +- kvm-block-rbd-Attempt-to-parse-legacy-filenames.patch [bz#1635585] +- kvm-block-rbd-add-deprecation-documentation-for-filename.patch [bz#1635585] +- kvm-block-rbd-add-iotest-for-rbd-legacy-keyvalue-filenam.patch [bz#1635585] +- Resolves: bz#1635585 + (rbd json format of 7.6 is incompatible with 7.5) + +* Tue Oct 16 2018 Danilo Cesar Lemes de Paula - 2.12.0-40.el8 +- kvm-vnc-call-sasl_server_init-only-when-required.patch [bz#1609327] +- kvm-nbd-server-fix-NBD_CMD_CACHE.patch [bz#1636142] +- kvm-nbd-fix-NBD_FLAG_SEND_CACHE-value.patch [bz#1636142] +- kvm-test-bdrv-drain-bdrv_drain-works-with-cross-AioConte.patch [bz#1637976] +- kvm-block-Use-bdrv_do_drain_begin-end-in-bdrv_drain_all.patch [bz#1637976] +- kvm-block-Remove-recursive-parameter-from-bdrv_drain_inv.patch [bz#1637976] +- kvm-block-Don-t-manually-poll-in-bdrv_drain_all.patch [bz#1637976] +- kvm-tests-test-bdrv-drain-bdrv_drain_all-works-in-corout.patch [bz#1637976] +- kvm-block-Avoid-unnecessary-aio_poll-in-AIO_WAIT_WHILE.patch [bz#1637976] +- kvm-block-Really-pause-block-jobs-on-drain.patch [bz#1637976] +- kvm-block-Remove-bdrv_drain_recurse.patch [bz#1637976] +- kvm-test-bdrv-drain-Add-test-for-node-deletion.patch [bz#1637976] +- kvm-block-Drain-recursively-with-a-single-BDRV_POLL_WHIL.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-node-deletion-in-subtree-recurs.patch [bz#1637976] +- kvm-block-Don-t-poll-in-parent-drain-callbacks.patch [bz#1637976] +- 
kvm-test-bdrv-drain-Graph-change-through-parent-callback.patch [bz#1637976] +- kvm-block-Defer-.bdrv_drain_begin-callback-to-polling-ph.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-that-bdrv_drain_invoke-doesn-t-.patch [bz#1637976] +- kvm-block-Allow-AIO_WAIT_WHILE-with-NULL-ctx.patch [bz#1637976] +- kvm-block-Move-bdrv_drain_all_begin-out-of-coroutine-con.patch [bz#1637976] +- kvm-block-ignore_bds_parents-parameter-for-drain-functio.patch [bz#1637976] +- kvm-block-Allow-graph-changes-in-bdrv_drain_all_begin-en.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-graph-changes-in-drain_all-sect.patch [bz#1637976] +- kvm-block-Poll-after-drain-on-attaching-a-node.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-bdrv_append-to-drained-node.patch [bz#1637976] +- kvm-block-linux-aio-acquire-AioContext-before-qemu_laio_.patch [bz#1637976] +- kvm-util-async-use-qemu_aio_coroutine_enter-in-co_schedu.patch [bz#1637976] +- kvm-job-Fix-nested-aio_poll-hanging-in-job_txn_apply.patch [bz#1637976] +- kvm-job-Fix-missing-locking-due-to-mismerge.patch [bz#1637976] +- kvm-blockjob-Wake-up-BDS-when-job-becomes-idle.patch [bz#1637976] +- kvm-aio-wait-Increase-num_waiters-even-in-home-thread.patch [bz#1637976] +- kvm-test-bdrv-drain-Drain-with-block-jobs-in-an-I-O-thre.patch [bz#1637976] +- kvm-test-blockjob-Acquire-AioContext-around-job_cancel_s.patch [bz#1637976] +- kvm-job-Use-AIO_WAIT_WHILE-in-job_finish_sync.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-AIO_WAIT_WHILE-in-completion-ca.patch [bz#1637976] +- kvm-block-Add-missing-locking-in-bdrv_co_drain_bh_cb.patch [bz#1637976] +- kvm-block-backend-Add-.drained_poll-callback.patch [bz#1637976] +- kvm-block-backend-Fix-potential-double-blk_delete.patch [bz#1637976] +- kvm-block-backend-Decrease-in_flight-only-after-callback.patch [bz#1637976] +- kvm-blockjob-Lie-better-in-child_job_drained_poll.patch [bz#1637976] +- kvm-block-Remove-aio_poll-in-bdrv_drain_poll-variants.patch [bz#1637976] +- 
kvm-test-bdrv-drain-Test-nested-poll-in-bdrv_drain_poll_.patch [bz#1637976] +- kvm-job-Avoid-deadlocks-in-job_completed_txn_abort.patch [bz#1637976] +- kvm-test-bdrv-drain-AIO_WAIT_WHILE-in-job-.commit-.abort.patch [bz#1637976] +- kvm-test-bdrv-drain-Fix-outdated-comments.patch [bz#1637976] +- kvm-block-Use-a-single-global-AioWait.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-draining-job-source-child-and-p.patch [bz#1637976] +- kvm-qemu-img-Fix-assert-when-mapping-unaligned-raw-file.patch [bz#1639374] +- kvm-iotests-Add-test-221-to-catch-qemu-img-map-regressio.patch [bz#1639374] +- Resolves: bz#1609327 + (qemu-kvm[37046]: Could not find keytab file: /etc/qemu/krb5.tab: Unknown error 49408) +- Resolves: bz#1636142 + (qemu NBD_CMD_CACHE flaws impacting non-qemu NBD clients) +- Resolves: bz#1637976 + (Crashes and hangs with iothreads vs. block jobs) +- Resolves: bz#1639374 + (qemu-img map 'Aborted (core dumped)' when specifying a plain file) + +* Tue Oct 16 2018 Danilo Cesar Lemes de Paula - 2.12.0-39.el8 +- kvm-linux-headers-update.patch [bz#1508142] +- kvm-s390x-cpumodel-Set-up-CPU-model-for-AP-device-suppor.patch [bz#1508142] +- kvm-s390x-kvm-enable-AP-instruction-interpretation-for-g.patch [bz#1508142] +- kvm-s390x-ap-base-Adjunct-Processor-AP-object-model.patch [bz#1508142] +- kvm-s390x-vfio-ap-Introduce-VFIO-AP-device.patch [bz#1508142] +- kvm-s390-doc-detailed-specifications-for-AP-virtualizati.patch [bz#1508142] +- Resolves: bz#1508142 + ([IBM 8.0 FEAT] KVM: Guest-dedicated Crypto Adapters - qemu part) + +* Mon Oct 15 2018 Danilo Cesar Lemes de Paula - 2.12.0-38.el8 +- kvm-Revert-hw-acpi-build-build-SRAT-memory-affinity-stru.patch [bz#1609235] +- kvm-add-udev-kvm-check.patch [bz#1552663] +- kvm-aio-posix-Don-t-count-ctx-notifier-as-progress-when-.patch [bz#1623085] +- kvm-aio-Do-aio_notify_accept-only-during-blocking-aio_po.patch [bz#1623085] +- kvm-aio-posix-fix-concurrent-access-to-poll_disable_cnt.patch [bz#1632622] +- 
kvm-aio-posix-compute-timeout-before-polling.patch [bz#1632622] +- kvm-aio-posix-do-skip-system-call-if-ctx-notifier-pollin.patch [bz#1632622] +- kvm-intel-iommu-send-PSI-always-even-if-across-PDEs.patch [bz#1450712] +- kvm-intel-iommu-remove-IntelIOMMUNotifierNode.patch [bz#1450712] +- kvm-intel-iommu-add-iommu-lock.patch [bz#1450712] +- kvm-intel-iommu-only-do-page-walk-for-MAP-notifiers.patch [bz#1450712] +- kvm-intel-iommu-introduce-vtd_page_walk_info.patch [bz#1450712] +- kvm-intel-iommu-pass-in-address-space-when-page-walk.patch [bz#1450712] +- kvm-intel-iommu-trace-domain-id-during-page-walk.patch [bz#1450712] +- kvm-util-implement-simple-iova-tree.patch [bz#1450712] +- kvm-intel-iommu-rework-the-page-walk-logic.patch [bz#1450712] +- kvm-i386-define-the-ssbd-CPUID-feature-bit-CVE-2018-3639.patch [bz#1633928] +- Resolves: bz#1450712 + (Booting nested guest with vIOMMU, the assigned network devices can not receive packets (qemu)) +- Resolves: bz#1552663 + (81-kvm-rhel.rules is no longer part of initscripts) +- Resolves: bz#1609235 + (Win2016 guest can't recognize pc-dimm hotplugged to node 0) +- Resolves: bz#1623085 + (VM doesn't boot from HD) +- Resolves: bz#1632622 + (~40% virtio_blk disk performance drop for win2012r2 guest when comparing qemu-kvm-rhev-2.12.0-9 with qemu-kvm-rhev-2.12.0-12) +- Resolves: bz#1633928 + (CVE-2018-3639 qemu-kvm: hw: cpu: speculative store bypass [rhel-8.0]) + +* Fri Oct 12 2018 Danilo Cesar Lemes de Paula - 2.12.0-37.el8 +- kvm-block-for-jobs-do-not-clear-user_paused-until-after-.patch [bz#1635583] +- kvm-iotests-Add-failure-matching-to-common.qemu.patch [bz#1635583] +- kvm-block-iotest-to-catch-abort-on-forced-blockjob-cance.patch [bz#1635583] +- Resolves: bz#1635583 + (Quitting VM causes qemu core dump once the block mirror job paused for no enough target space) + +* Fri Oct 12 2018 Danilo Cesar Lemes de Paula - 2.12.0-36.el8 +- kvm-check-Only-test-ivshm-when-it-is-compiled-in.patch [bz#1621817] +- kvm-Disable-ivshmem.patch 
[bz#1621817] +- kvm-mirror-Fail-gracefully-for-source-target.patch [bz#1637963] +- kvm-commit-Add-top-node-base-node-options.patch [bz#1637970] +- kvm-qemu-iotests-Test-commit-with-top-node-base-node.patch [bz#1637970] +- Resolves: bz#1621817 + (Disable IVSHMEM in RHEL 8) +- Resolves: bz#1637963 + (Segfault on 'blockdev-mirror' with same node as source and target) +- Resolves: bz#1637970 + (allow using node-names with block-commit) + +* Thu Oct 11 2018 Danilo Cesar Lemes de Paula - 2.12.0-35.el8 +- kvm-redhat-make-the-plugins-executable.patch [bz#1638304] +- Resolves: bz#1638304 + (the driver packages lack all the library Requires) + +* Thu Oct 11 2018 Danilo Cesar Lemes de Paula - 2.12.0-34.el8 +- kvm-seccomp-allow-sched_setscheduler-with-SCHED_IDLE-pol.patch [bz#1618356] +- kvm-seccomp-use-SIGSYS-signal-instead-of-killing-the-thr.patch [bz#1618356] +- kvm-seccomp-prefer-SCMP_ACT_KILL_PROCESS-if-available.patch [bz#1618356] +- kvm-configure-require-libseccomp-2.2.0.patch [bz#1618356] +- kvm-seccomp-set-the-seccomp-filter-to-all-threads.patch [bz#1618356] +- kvm-memory-cleanup-side-effects-of-memory_region_init_fo.patch [bz#1600365] +- Resolves: bz#1600365 + (QEMU core dumped when hotplug memory exceeding host hugepages and with discard-data=yes) +- Resolves: bz#1618356 + (qemu-kvm: Qemu: seccomp: blacklist is not applied to all threads [rhel-8]) + +* Fri Oct 05 2018 Danilo Cesar Lemes de Paula - 2.12.0-33.el8 +- kvm-migration-postcopy-Clear-have_listen_thread.patch [bz#1608765] +- kvm-migration-cleanup-in-error-paths-in-loadvm.patch [bz#1608765] +- kvm-jobs-change-start-callback-to-run-callback.patch [bz#1632939] +- kvm-jobs-canonize-Error-object.patch [bz#1632939] +- kvm-jobs-add-exit-shim.patch [bz#1632939] +- kvm-block-commit-utilize-job_exit-shim.patch [bz#1632939] +- kvm-block-mirror-utilize-job_exit-shim.patch [bz#1632939] +- kvm-jobs-utilize-job_exit-shim.patch [bz#1632939] +- kvm-block-backup-make-function-variables-consistently-na.patch [bz#1632939] +- 
kvm-jobs-remove-ret-argument-to-job_completed-privatize-.patch [bz#1632939] +- kvm-jobs-remove-job_defer_to_main_loop.patch [bz#1632939] +- kvm-block-commit-add-block-job-creation-flags.patch [bz#1632939] +- kvm-block-mirror-add-block-job-creation-flags.patch [bz#1632939] +- kvm-block-stream-add-block-job-creation-flags.patch [bz#1632939] +- kvm-block-commit-refactor-commit-to-use-job-callbacks.patch [bz#1632939] +- kvm-block-mirror-don-t-install-backing-chain-on-abort.patch [bz#1632939] +- kvm-block-mirror-conservative-mirror_exit-refactor.patch [bz#1632939] +- kvm-block-stream-refactor-stream-to-use-job-callbacks.patch [bz#1632939] +- kvm-tests-blockjob-replace-Blockjob-with-Job.patch [bz#1632939] +- kvm-tests-test-blockjob-remove-exit-callback.patch [bz#1632939] +- kvm-tests-test-blockjob-txn-move-.exit-to-.clean.patch [bz#1632939] +- kvm-jobs-remove-.exit-callback.patch [bz#1632939] +- kvm-qapi-block-commit-expose-new-job-properties.patch [bz#1632939] +- kvm-qapi-block-mirror-expose-new-job-properties.patch [bz#1632939] +- kvm-qapi-block-stream-expose-new-job-properties.patch [bz#1632939] +- kvm-block-backup-qapi-documentation-fixup.patch [bz#1632939] +- kvm-blockdev-document-transactional-shortcomings.patch [bz#1632939] +- Resolves: bz#1608765 + (After postcopy migration, do savevm and loadvm, guest hang and call trace) +- Resolves: bz#1632939 + (qemu blockjobs other than backup do not support job-finalize or job-dismiss) + +* Fri Sep 28 2018 Danilo Cesar Lemes de Paula - 2.12.0-32.el8 +- kvm-Re-enable-disabled-Hyper-V-enlightenments.patch [bz#1625185] +- kvm-Fix-annocheck-issues.patch [bz#1624164] +- kvm-exec-check-that-alignment-is-a-power-of-two.patch [bz#1630746] +- kvm-curl-Make-sslverify-off-disable-host-as-well-as-peer.patch [bz#1575925] +- Resolves: bz#1575925 + ("SSL: no alternative certificate subject name matches target host name" error even though sslverify = off) +- Resolves: bz#1624164 + (Review annocheck distro flag failures in qemu-kvm) +- 
Resolves: bz#1625185 + (Re-enable disabled Hyper-V enlightenments) +- Resolves: bz#1630746 + (qemu_ram_mmap: Assertion `is_power_of_2(align)' failed) + +* Tue Sep 11 2018 Danilo Cesar Lemes de Paula - 2.12.0-31.el8 +- kvm-i386-Disable-TOPOEXT-by-default-on-cpu-host.patch [bz#1619804] +- kvm-redhat-enable-opengl-add-build-and-runtime-deps.patch [bz#1618412] +- Resolves: bz#1618412 + (Enable opengl (for intel vgpu display)) +- Resolves: bz#1619804 + (kernel panic in init_amd_cacheinfo) + +* Wed Sep 05 2018 Danilo Cesar Lemes de Paula - 2.12.0-30.el8 +- kvm-redhat-Disable-vhost-crypto.patch [bz#1625668] +- Resolves: bz#1625668 + (Decide if we should disable 'vhost-crypto' or not) + +* Wed Sep 05 2018 Danilo Cesar Lemes de Paula - 2.12.0-29.el8 +- kvm-target-i386-sev-fix-memory-leaks.patch [bz#1615717] +- kvm-i386-Fix-arch_query_cpu_model_expansion-leak.patch [bz#1615717] +- kvm-redhat-Update-build-configuration.patch [bz#1573156] +- Resolves: bz#1573156 + (Update build configure for QEMU 2.12.0) +- Resolves: bz#1615717 + (Memory leaks) + +* Tue Sep 04 2018 Danilo Cesar Lemes de Paula - 2.12.0-28.el8 +- kvm-e1000e-Do-not-auto-clear-ICR-bits-which-aren-t-set-i.patch [bz#1596024] +- kvm-e1000e-Prevent-MSI-MSI-X-storms.patch [bz#1596024] +- kvm-Drop-build_configure.sh-and-Makefile.local-files.patch [] +- kvm-Fix-subject-line-in-.gitpublish.patch [] +- Resolves: bz#1596024 + (The network link can't be detected on guest when the guest uses e1000e model type) + +* Wed Aug 29 2018 Danilo Cesar Lemes de Paula - 2.12.0-27.el8 +- kvm-Fix-libusb-1.0.22-deprecated-libusb_set_debug-with-l.patch [bz#1622656] +- Resolves: bz#1622656 + (qemu-kvm fails to build due to libusb_set_debug being deprecated) + +* Fri Aug 17 2018 Danilo Cesar Lemes de Paula - 2.12.0-26.el8 +- kvm-redhat-remove-extra-in-rhel_rhev_conflicts-macro.patch [bz#1618752] +- Resolves: bz#1618752 + (qemu-kvm can't be installed in RHEL-8 as it Conflicts with itself.) 
+ +* Thu Aug 16 2018 Danilo Cesar Lemes de Paula - 2.12.0-25.el8 +- kvm-Migration-TLS-Fix-crash-due-to-double-cleanup.patch [bz#1594384] +- Resolves: bz#1594384 + (2.12 migration fixes) + +* Tue Aug 14 2018 Danilo Cesar Lemes de Paula - 2.12.0-24.el8 +- kvm-Add-qemu-keymap-to-qemu-kvm-common.patch [bz#1593117] +- Resolves: bz#1593117 + (add qemu-keymap utility) + +* Fri Aug 10 2018 Danilo Cesar Lemes de Paula - 2.12.0-23.el8 +- Fixing an issue with some old command in the spec file + +* Fri Aug 10 2018 Danilo Cesar Lemes de Paula - 2.12.0-22.el8 +- Fix an issue with the build_configure script. +- Resolves: bz#1425820 + (Improve QEMU packaging layout with modularization of the block layer) + + +* Fri Aug 10 2018 Danilo Cesar Lemes de Paula - 2.12.0-20.el8 +- kvm-migration-stop-compressing-page-in-migration-thread.patch [bz#1594384] +- kvm-migration-stop-compression-to-allocate-and-free-memo.patch [bz#1594384] +- kvm-migration-stop-decompression-to-allocate-and-free-me.patch [bz#1594384] +- kvm-migration-detect-compression-and-decompression-error.patch [bz#1594384] +- kvm-migration-introduce-control_save_page.patch [bz#1594384] +- kvm-migration-move-some-code-to-ram_save_host_page.patch [bz#1594384] +- kvm-migration-move-calling-control_save_page-to-the-comm.patch [bz#1594384] +- kvm-migration-move-calling-save_zero_page-to-the-common-.patch [bz#1594384] +- kvm-migration-introduce-save_normal_page.patch [bz#1594384] +- kvm-migration-remove-ram_save_compressed_page.patch [bz#1594384] +- kvm-migration-block-dirty-bitmap-fix-memory-leak-in-dirt.patch [bz#1594384] +- kvm-migration-fix-saving-normal-page-even-if-it-s-been-c.patch [bz#1594384] +- kvm-migration-update-index-field-when-delete-or-qsort-RD.patch [bz#1594384] +- kvm-migration-introduce-decompress-error-check.patch [bz#1594384] +- kvm-migration-Don-t-activate-block-devices-if-using-S.patch [bz#1594384] +- kvm-migration-not-wait-RDMA_CM_EVENT_DISCONNECTED-event-.patch [bz#1594384] +- 
kvm-migration-block-dirty-bitmap-fix-dirty_bitmap_load.patch [bz#1594384] +- kvm-s390x-add-RHEL-7.6-machine-type-for-ccw.patch [bz#1595718] +- kvm-s390x-cpumodel-default-enable-bpb-and-ppa15-for-z196.patch [bz#1595718] +- kvm-linux-headers-asm-s390-kvm.h-header-sync.patch [bz#1612938] +- kvm-s390x-kvm-add-etoken-facility.patch [bz#1612938] +- Resolves: bz#1594384 + (2.12 migration fixes) +- Resolves: bz#1595718 + (Add ppa15/bpb to the default cpu model for z196 and higher in the 7.6 s390-ccw-virtio machine) +- Resolves: bz#1612938 + (Add etoken support to qemu-kvm for s390x KVM guests) + +* Fri Aug 10 2018 Danilo Cesar Lemes de Paula - 2.12.0-18.el8 + Mass import from RHEL 7.6 qemu-kvm-rhev, including fixes to the following BZs: + +- kvm-AArch64-Add-virt-rhel7.6-machine-type.patch [bz#1558723] +- kvm-cpus-Fix-event-order-on-resume-of-stopped-guest.patch [bz#1566153] +- kvm-qemu-img-Check-post-truncation-size.patch [bz#1523065] +- kvm-vga-catch-depth-0.patch [bz#1575541] +- kvm-Fix-x-hv-max-vps-compat-value-for-7.4-machine-type.patch [bz#1583959] +- kvm-ccid-card-passthru-fix-regression-in-realize.patch [bz#1584984] +- kvm-Use-4-MB-vram-for-cirrus.patch [bz#1542080] +- kvm-spapr_pci-Remove-unhelpful-pagesize-warning.patch [bz#1505664] +- kvm-rpm-Add-nvme-VFIO-driver-to-rw-whitelist.patch [bz#1416180] +- kvm-qobject-Use-qobject_to-instead-of-type-cast.patch [bz#1557995] +- kvm-qobject-Ensure-base-is-at-offset-0.patch [bz#1557995] +- kvm-qobject-use-a-QObjectBase_-struct.patch [bz#1557995] +- kvm-qobject-Replace-qobject_incref-QINCREF-qobject_decre.patch [bz#1557995] +- kvm-qobject-Modify-qobject_ref-to-return-obj.patch [bz#1557995] +- kvm-rbd-Drop-deprecated-drive-parameter-filename.patch [bz#1557995] +- kvm-iscsi-Drop-deprecated-drive-parameter-filename.patch [bz#1557995] +- kvm-block-Add-block-specific-QDict-header.patch [bz#1557995] +- kvm-qobject-Move-block-specific-qdict-code-to-block-qdic.patch [bz#1557995] +- 
kvm-block-Fix-blockdev-for-certain-non-string-scalars.patch [bz#1557995] +- kvm-block-Fix-drive-for-certain-non-string-scalars.patch [bz#1557995] +- kvm-block-Clean-up-a-misuse-of-qobject_to-in-.bdrv_co_cr.patch [bz#1557995] +- kvm-block-Factor-out-qobject_input_visitor_new_flat_conf.patch [bz#1557995] +- kvm-block-Make-remaining-uses-of-qobject-input-visitor-m.patch [bz#1557995] +- kvm-block-qdict-Simplify-qdict_flatten_qdict.patch [bz#1557995] +- kvm-block-qdict-Tweak-qdict_flatten_qdict-qdict_flatten_.patch [bz#1557995] +- kvm-block-qdict-Clean-up-qdict_crumple-a-bit.patch [bz#1557995] +- kvm-block-qdict-Simplify-qdict_is_list-some.patch [bz#1557995] +- kvm-check-block-qdict-Rename-qdict_flatten-s-variables-f.patch [bz#1557995] +- kvm-check-block-qdict-Cover-flattening-of-empty-lists-an.patch [bz#1557995] +- kvm-block-Fix-blockdev-blockdev-add-for-empty-objects-an.patch [bz#1557995] +- kvm-rbd-New-parameter-auth-client-required.patch [bz#1557995] +- kvm-rbd-New-parameter-key-secret.patch [bz#1557995] +- kvm-block-mirror-honor-ratelimit-again.patch [bz#1572856] +- kvm-block-mirror-Make-cancel-always-cancel-pre-READY.patch [bz#1572856] +- kvm-iotests-Add-test-for-cancelling-a-mirror-job.patch [bz#1572856] +- kvm-iotests-Split-214-off-of-122.patch [bz#1518738] +- kvm-block-Add-COR-filter-driver.patch [bz#1518738] +- kvm-block-BLK_PERM_WRITE-includes-._UNCHANGED.patch [bz#1518738] +- kvm-block-Add-BDRV_REQ_WRITE_UNCHANGED-flag.patch [bz#1518738] +- kvm-block-Set-BDRV_REQ_WRITE_UNCHANGED-for-COR-writes.patch [bz#1518738] +- kvm-block-quorum-Support-BDRV_REQ_WRITE_UNCHANGED.patch [bz#1518738] +- kvm-block-Support-BDRV_REQ_WRITE_UNCHANGED-in-filters.patch [bz#1518738] +- kvm-iotests-Clean-up-wrap-image-in-197.patch [bz#1518738] +- kvm-iotests-Copy-197-for-COR-filter-driver.patch [bz#1518738] +- kvm-iotests-Add-test-for-COR-across-nodes.patch [bz#1518738] +- kvm-qemu-io-Use-purely-string-blockdev-options.patch [bz#1576598] +- 
kvm-qemu-img-Use-only-string-options-in-img_open_opts.patch [bz#1576598] +- kvm-iotests-Add-test-for-U-force-share-conflicts.patch [bz#1576598] +- kvm-qemu-io-Drop-command-functions-return-values.patch [bz#1519617] +- kvm-qemu-io-Let-command-functions-return-error-code.patch [bz#1519617] +- kvm-qemu-io-Exit-with-error-when-a-command-failed.patch [bz#1519617] +- kvm-iotests.py-Add-qemu_io_silent.patch [bz#1519617] +- kvm-iotests-Let-216-make-use-of-qemu-io-s-exit-code.patch [bz#1519617] +- kvm-qcow2-Repair-OFLAG_COPIED-when-fixing-leaks.patch [bz#1527085] +- kvm-iotests-Repairing-error-during-snapshot-deletion.patch [bz#1527085] +- kvm-block-Make-bdrv_is_writable-public.patch [bz#1588039] +- kvm-qcow2-Do-not-mark-inactive-images-corrupt.patch [bz#1588039] +- kvm-iotests-Add-case-for-a-corrupted-inactive-image.patch [bz#1588039] +- kvm-main-loop-drop-spin_counter.patch [bz#1168213] +- kvm-target-ppc-Factor-out-the-parsing-in-kvmppc_get_cpu_.patch [bz#1560847] +- kvm-target-ppc-Don-t-require-private-l1d-cache-on-POWER8.patch [bz#1560847] +- kvm-ppc-spapr_caps-Don-t-disable-cap_cfpc-on-POWER8-by-d.patch [bz#1560847] +- kvm-qxl-fix-local-renderer-crash.patch [bz#1567733] +- kvm-qemu-img-Amendment-support-implies-create_opts.patch [bz#1537956] +- kvm-block-Add-Error-parameter-to-bdrv_amend_options.patch [bz#1537956] +- kvm-qemu-option-Pull-out-Supported-options-print.patch [bz#1537956] +- kvm-qemu-img-Add-print_amend_option_help.patch [bz#1537956] +- kvm-qemu-img-Recognize-no-creation-support-in-o-help.patch [bz#1537956] +- kvm-iotests-Test-help-option-for-unsupporting-formats.patch [bz#1537956] +- kvm-iotests-Rework-113.patch [bz#1537956] +- kvm-qemu-img-Resolve-relative-backing-paths-in-rebase.patch [bz#1569835] +- kvm-iotests-Add-test-for-rebasing-with-relative-paths.patch [bz#1569835] +- kvm-qemu-img-Special-post-backing-convert-handling.patch [bz#1527898] +- kvm-iotests-Test-post-backing-convert-target-behavior.patch [bz#1527898] +- 
kvm-migration-calculate-expected_downtime-with-ram_bytes.patch [bz#1564576] +- kvm-sheepdog-Fix-sd_co_create_opts-memory-leaks.patch [bz#1513543] +- kvm-qemu-iotests-reduce-chance-of-races-in-185.patch [bz#1513543] +- kvm-blockjob-do-not-cancel-timer-in-resume.patch [bz#1513543] +- kvm-nfs-Fix-error-path-in-nfs_options_qdict_to_qapi.patch [bz#1513543] +- kvm-nfs-Remove-processed-options-from-QDict.patch [bz#1513543] +- kvm-blockjob-drop-block_job_pause-resume_all.patch [bz#1513543] +- kvm-blockjob-expose-error-string-via-query.patch [bz#1513543] +- kvm-blockjob-Fix-assertion-in-block_job_finalize.patch [bz#1513543] +- kvm-blockjob-Wrappers-for-progress-counter-access.patch [bz#1513543] +- kvm-blockjob-Move-RateLimit-to-BlockJob.patch [bz#1513543] +- kvm-blockjob-Implement-block_job_set_speed-centrally.patch [bz#1513543] +- kvm-blockjob-Introduce-block_job_ratelimit_get_delay.patch [bz#1513543] +- kvm-blockjob-Add-block_job_driver.patch [bz#1513543] +- kvm-blockjob-Update-block-job-pause-resume-documentation.patch [bz#1513543] +- kvm-blockjob-Improve-BlockJobInfo.offset-len-documentati.patch [bz#1513543] +- kvm-job-Create-Job-JobDriver-and-job_create.patch [bz#1513543] +- kvm-job-Rename-BlockJobType-into-JobType.patch [bz#1513543] +- kvm-job-Add-JobDriver.job_type.patch [bz#1513543] +- kvm-job-Add-job_delete.patch [bz#1513543] +- kvm-job-Maintain-a-list-of-all-jobs.patch [bz#1513543] +- kvm-job-Move-state-transitions-to-Job.patch [bz#1513543] +- kvm-job-Add-reference-counting.patch [bz#1513543] +- kvm-job-Move-cancelled-to-Job.patch [bz#1513543] +- kvm-job-Add-Job.aio_context.patch [bz#1513543] +- kvm-job-Move-defer_to_main_loop-to-Job.patch [bz#1513543] +- kvm-job-Move-coroutine-and-related-code-to-Job.patch [bz#1513543] +- kvm-job-Add-job_sleep_ns.patch [bz#1513543] +- kvm-job-Move-pause-resume-functions-to-Job.patch [bz#1513543] +- kvm-job-Replace-BlockJob.completed-with-job_is_completed.patch [bz#1513543] +- kvm-job-Move-BlockJobCreateFlags-to-Job.patch 
[bz#1513543] +- kvm-blockjob-Split-block_job_event_pending.patch [bz#1513543] +- kvm-job-Add-job_event_.patch [bz#1513543] +- kvm-job-Move-single-job-finalisation-to-Job.patch [bz#1513543] +- kvm-job-Convert-block_job_cancel_async-to-Job.patch [bz#1513543] +- kvm-job-Add-job_drain.patch [bz#1513543] +- kvm-job-Move-.complete-callback-to-Job.patch [bz#1513543] +- kvm-job-Move-job_finish_sync-to-Job.patch [bz#1513543] +- kvm-job-Switch-transactions-to-JobTxn.patch [bz#1513543] +- kvm-job-Move-transactions-to-Job.patch [bz#1513543] +- kvm-job-Move-completion-and-cancellation-to-Job.patch [bz#1513543] +- kvm-block-Cancel-job-in-bdrv_close_all-callers.patch [bz#1513543] +- kvm-job-Add-job_yield.patch [bz#1513543] +- kvm-job-Add-job_dismiss.patch [bz#1513543] +- kvm-job-Add-job_is_ready.patch [bz#1513543] +- kvm-job-Add-job_transition_to_ready.patch [bz#1513543] +- kvm-job-Move-progress-fields-to-Job.patch [bz#1513543] +- kvm-job-Introduce-qapi-job.json.patch [bz#1513543] +- kvm-job-Add-JOB_STATUS_CHANGE-QMP-event.patch [bz#1513543] +- kvm-job-Add-lifecycle-QMP-commands.patch [bz#1513543] +- kvm-job-Add-query-jobs-QMP-command.patch [bz#1513543] +- kvm-blockjob-Remove-BlockJob.driver.patch [bz#1513543] +- kvm-iotests-Move-qmp_to_opts-to-VM.patch [bz#1513543] +- kvm-qemu-iotests-Test-job-with-block-jobs.patch [bz#1513543] +- kvm-vdi-Fix-vdi_co_do_create-return-value.patch [bz#1513543] +- kvm-vhdx-Fix-vhdx_co_create-return-value.patch [bz#1513543] +- kvm-job-Add-error-message-for-failing-jobs.patch [bz#1513543] +- kvm-block-create-Make-x-blockdev-create-a-job.patch [bz#1513543] +- kvm-qemu-iotests-Add-VM.get_qmp_events_filtered.patch [bz#1513543] +- kvm-qemu-iotests-Add-VM.qmp_log.patch [bz#1513543] +- kvm-qemu-iotests-Add-iotests.img_info_log.patch [bz#1513543] +- kvm-qemu-iotests-Add-VM.run_job.patch [bz#1513543] +- kvm-qemu-iotests-iotests.py-helper-for-non-file-protocol.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-206-for-blockdev-create-job.patch [bz#1513543] +- 
kvm-qemu-iotests-Rewrite-207-for-blockdev-create-job.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-210-for-blockdev-create-job.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-211-for-blockdev-create-job.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-212-for-blockdev-create-job.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-213-for-blockdev-create-job.patch [bz#1513543] +- kvm-block-create-Mark-blockdev-create-stable.patch [bz#1513543] +- kvm-jobs-fix-stale-wording.patch [bz#1513543] +- kvm-jobs-fix-verb-references-in-docs.patch [bz#1513543] +- kvm-iotests-Fix-219-s-timing.patch [bz#1513543] +- kvm-iotests-improve-pause_job.patch [bz#1513543] +- kvm-rpm-Whitelist-copy-on-read-block-driver.patch [bz#1518738] +- kvm-rpm-add-throttle-driver-to-rw-whitelist.patch [bz#1591076] +- kvm-usb-host-skip-open-on-pending-postload-bh.patch [bz#1572851] +- kvm-i386-Define-the-Virt-SSBD-MSR-and-handling-of-it-CVE.patch [bz#1574216] +- kvm-i386-define-the-AMD-virt-ssbd-CPUID-feature-bit-CVE-.patch [bz#1574216] +- kvm-block-file-posix-Pass-FD-to-locking-helpers.patch [bz#1519144] +- kvm-block-file-posix-File-locking-during-creation.patch [bz#1519144] +- kvm-iotests-Add-creation-test-to-153.patch [bz#1519144] +- kvm-vhost-user-add-Net-prefix-to-internal-state-structur.patch [bz#1526645] +- kvm-virtio-support-setting-memory-region-based-host-noti.patch [bz#1526645] +- kvm-vhost-user-support-receiving-file-descriptors-in-sla.patch [bz#1526645] +- kvm-osdep-add-wait.h-compat-macros.patch [bz#1526645] +- kvm-vhost-user-bridge-support-host-notifier.patch [bz#1526645] +- kvm-vhost-allow-backends-to-filter-memory-sections.patch [bz#1526645] +- kvm-vhost-user-allow-slave-to-send-fds-via-slave-channel.patch [bz#1526645] +- kvm-vhost-user-introduce-shared-vhost-user-state.patch [bz#1526645] +- kvm-vhost-user-support-registering-external-host-notifie.patch [bz#1526645] +- kvm-libvhost-user-support-host-notifier.patch [bz#1526645] +- kvm-block-Introduce-API-for-copy-offloading.patch [bz#1482537] 
+- kvm-raw-Check-byte-range-uniformly.patch [bz#1482537] +- kvm-raw-Implement-copy-offloading.patch [bz#1482537] +- kvm-qcow2-Implement-copy-offloading.patch [bz#1482537] +- kvm-file-posix-Implement-bdrv_co_copy_range.patch [bz#1482537] +- kvm-iscsi-Query-and-save-device-designator-when-opening.patch [bz#1482537] +- kvm-iscsi-Create-and-use-iscsi_co_wait_for_task.patch [bz#1482537] +- kvm-iscsi-Implement-copy-offloading.patch [bz#1482537] +- kvm-block-backend-Add-blk_co_copy_range.patch [bz#1482537] +- kvm-qemu-img-Convert-with-copy-offloading.patch [bz#1482537] +- kvm-qcow2-Fix-src_offset-in-copy-offloading.patch [bz#1482537] +- kvm-iscsi-Don-t-blindly-use-designator-length-in-respons.patch [bz#1482537] +- kvm-file-posix-Fix-EINTR-handling.patch [bz#1482537] +- kvm-usb-storage-Add-rerror-werror-properties.patch [bz#1595180] +- kvm-numa-clarify-error-message-when-node-index-is-out-of.patch [bz#1578381] +- kvm-qemu-iotests-Update-026.out.nocache-reference-output.patch [bz#1528541] +- kvm-qcow2-Free-allocated-clusters-on-write-error.patch [bz#1528541] +- kvm-qemu-iotests-Test-qcow2-not-leaking-clusters-on-writ.patch [bz#1528541] +- kvm-qemu-options-Add-missing-newline-to-accel-help-text.patch [bz#1586313] +- kvm-xhci-fix-guest-triggerable-assert.patch [bz#1594135] +- kvm-virtio-gpu-tweak-scanout-disable.patch [bz#1589634] +- kvm-virtio-gpu-update-old-resource-too.patch [bz#1589634] +- kvm-virtio-gpu-disable-scanout-when-backing-resource-is-.patch [bz#1589634] +- kvm-block-Don-t-silently-truncate-node-names.patch [bz#1549654] +- kvm-pr-helper-fix-socket-path-default-in-help.patch [bz#1533158] +- kvm-pr-helper-fix-assertion-failure-on-failed-multipath-.patch [bz#1533158] +- kvm-pr-manager-helper-avoid-SIGSEGV-when-writing-to-the-.patch [bz#1533158] +- kvm-pr-manager-put-stubs-in-.c-file.patch [bz#1533158] +- kvm-pr-manager-add-query-pr-managers-QMP-command.patch [bz#1533158] +- kvm-pr-manager-helper-report-event-on-connection-disconn.patch [bz#1533158] +- 
kvm-pr-helper-avoid-error-on-PR-IN-command-with-zero-req.patch [bz#1533158] +- kvm-pr-helper-Rework-socket-path-handling.patch [bz#1533158] +- kvm-pr-manager-helper-fix-memory-leak-on-event.patch [bz#1533158] +- kvm-object-fix-OBJ_PROP_LINK_UNREF_ON_RELEASE-ambivalenc.patch [bz#1556678] +- kvm-usb-hcd-xhci-test-add-a-test-for-ccid-hotplug.patch [bz#1556678] +- kvm-Revert-usb-release-the-created-buses.patch [bz#1556678] +- kvm-file-posix-Fix-creation-locking.patch [bz#1599335] +- kvm-file-posix-Unlock-FD-after-creation.patch [bz#1599335] +- kvm-ahci-trim-signatures-on-raise-lower.patch [bz#1584914] +- kvm-ahci-fix-PxCI-register-race.patch [bz#1584914] +- kvm-ahci-don-t-schedule-unnecessary-BH.patch [bz#1584914] +- kvm-qcow2-Fix-qcow2_truncate-error-return-value.patch [bz#1595173] +- kvm-block-Convert-.bdrv_truncate-callback-to-coroutine_f.patch [bz#1595173] +- kvm-qcow2-Remove-coroutine-trampoline-for-preallocate_co.patch [bz#1595173] +- kvm-block-Move-bdrv_truncate-implementation-to-io.c.patch [bz#1595173] +- kvm-block-Use-tracked-request-for-truncate.patch [bz#1595173] +- kvm-file-posix-Make-.bdrv_co_truncate-asynchronous.patch [bz#1595173] +- kvm-block-Fix-copy-on-read-crash-with-partial-final-clus.patch [bz#1590640] +- kvm-block-fix-QEMU-crash-with-scsi-hd-and-drive_del.patch [bz#1599515] +- kvm-virtio-rng-process-pending-requests-on-DRIVER_OK.patch [bz#1576743] +- kvm-file-posix-specify-expected-filetypes.patch [bz#1525829] +- kvm-iotests-add-test-226-for-file-driver-types.patch [bz#1525829] +- kvm-block-dirty-bitmap-add-lock-to-bdrv_enable-disable_d.patch [bz#1207657] +- kvm-qapi-add-x-block-dirty-bitmap-enable-disable.patch [bz#1207657] +- kvm-qmp-transaction-support-for-x-block-dirty-bitmap-ena.patch [bz#1207657] +- kvm-qapi-add-x-block-dirty-bitmap-merge.patch [bz#1207657] +- kvm-qapi-add-disabled-parameter-to-block-dirty-bitmap-ad.patch [bz#1207657] +- kvm-block-dirty-bitmap-add-bdrv_enable_dirty_bitmap_lock.patch [bz#1207657] +- 
kvm-dirty-bitmap-fix-double-lock-on-bitmap-enabling.patch [bz#1207657] +- kvm-block-qcow2-bitmap-fix-free_bitmap_clusters.patch [bz#1207657] +- kvm-qcow2-add-overlap-check-for-bitmap-directory.patch [bz#1207657] +- kvm-blockdev-enable-non-root-nodes-for-backup-source.patch [bz#1207657] +- kvm-iotests-add-222-to-test-basic-fleecing.patch [bz#1207657] +- kvm-qcow2-Remove-dead-check-on-ret.patch [bz#1207657] +- kvm-block-Move-request-tracking-to-children-in-copy-offl.patch [bz#1207657] +- kvm-block-Fix-parameter-checking-in-bdrv_co_copy_range_i.patch [bz#1207657] +- kvm-block-Honour-BDRV_REQ_NO_SERIALISING-in-copy-range.patch [bz#1207657] +- kvm-backup-Use-copy-offloading.patch [bz#1207657] +- kvm-block-backup-disable-copy-offloading-for-backup.patch [bz#1207657] +- kvm-iotests-222-Don-t-run-with-luks.patch [bz#1207657] +- kvm-block-io-fix-copy_range.patch [bz#1207657] +- kvm-block-split-flags-in-copy_range.patch [bz#1207657] +- kvm-block-add-BDRV_REQ_SERIALISING-flag.patch [bz#1207657] +- kvm-block-backup-fix-fleecing-scheme-use-serialized-writ.patch [bz#1207657] +- kvm-nbd-server-Reject-0-length-block-status-request.patch [bz#1207657] +- kvm-nbd-server-fix-trace.patch [bz#1207657] +- kvm-nbd-server-refactor-NBDExportMetaContexts.patch [bz#1207657] +- kvm-nbd-server-add-nbd_meta_empty_or_pattern-helper.patch [bz#1207657] +- kvm-nbd-server-implement-dirty-bitmap-export.patch [bz#1207657] +- kvm-qapi-new-qmp-command-nbd-server-add-bitmap.patch [bz#1207657] +- kvm-docs-interop-add-nbd.txt.patch [bz#1207657] +- kvm-nbd-server-introduce-NBD_CMD_CACHE.patch [bz#1207657] +- kvm-nbd-server-Silence-gcc-false-positive.patch [bz#1207657] +- kvm-nbd-server-Fix-dirty-bitmap-logic-regression.patch [bz#1207657] +- kvm-nbd-server-fix-nbd_co_send_block_status.patch [bz#1207657] +- kvm-nbd-client-Add-x-dirty-bitmap-to-query-bitmap-from-s.patch [bz#1207657] +- kvm-iotests-New-test-223-for-exporting-dirty-bitmap-over.patch [bz#1207657] +- 
kvm-hw-char-serial-Only-retry-if-qemu_chr_fe_write-retur.patch [bz#1592817] +- kvm-hw-char-serial-retry-write-if-EAGAIN.patch [bz#1592817] +- kvm-throttle-groups-fix-hang-when-group-member-leaves.patch [bz#1535914] +- kvm-Disable-aarch64-devices-reappeared-after-2.12-rebase.patch [bz#1586357] +- kvm-Disable-split-irq-device.patch [bz#1586357] +- kvm-Disable-AT24Cx-i2c-eeprom.patch [bz#1586357] +- kvm-Disable-CAN-bus-devices.patch [bz#1586357] +- kvm-Disable-new-superio-devices.patch [bz#1586357] +- kvm-Disable-new-pvrdma-device.patch [bz#1586357] +- kvm-qdev-add-HotplugHandler-post_plug-callback.patch [bz#1607891] +- kvm-virtio-scsi-fix-hotplug-reset-vs-event-race.patch [bz#1607891] +- kvm-e1000-Fix-tso_props-compat-for-82540em.patch [bz#1608778] +- kvm-slirp-correct-size-computation-while-concatenating-m.patch [bz#1586255] +- kvm-s390x-sclp-fix-maxram-calculation.patch [bz#1595740] +- kvm-redhat-Make-gitpublish-profile-the-default-one.patch [bz#1425820] +- Resolves: bz#1168213 + (main-loop: WARNING: I/O thread spun for 1000 iterations while doing stream block device.) +- Resolves: bz#1207657 + (RFE: QEMU Incremental live backup - push and pull modes) +- Resolves: bz#1416180 + (QEMU VFIO based block driver for NVMe devices) +- Resolves: bz#1425820 + (Improve QEMU packaging layout with modularization of the block layer) +- Resolves: bz#1482537 + ([RFE] qemu-img copy-offloading (convert command)) +- Resolves: bz#1505664 + ("qemu-kvm: System page size 0x1000000 is not enabled in page_size_mask (0x11000). 
Performance may be slow" show up while using hugepage as guest's memory) +- Resolves: bz#1513543 + ([RFE] Add block job to create format on a storage device) +- Resolves: bz#1518738 + (Add 'copy-on-read' filter driver for use with blockdev-add) +- Resolves: bz#1519144 + (qemu-img: image locking doesn't cover image creation) +- Resolves: bz#1519617 + (The exit code should be non-zero when qemu-io reports an error) +- Resolves: bz#1523065 + ("qemu-img resize" should fail to decrease the size of logical partition/lvm/iSCSI image with raw format) +- Resolves: bz#1525829 + (can not boot up a scsi-block passthrough disk via -blockdev with error "cannot get SG_IO version number: Operation not supported. Is this a SCSI device?") +- Resolves: bz#1526645 + ([Intel 7.6 FEAT] vHost Data Plane Acceleration (vDPA) - vhost user client - qemu-kvm-rhev) +- Resolves: bz#1527085 + (The copied flag should be updated during '-r leaks') +- Resolves: bz#1527898 + ([RFE] qemu-img should leave cluster unallocated if it's read as zero throughout the backing chain) +- Resolves: bz#1528541 + (qemu-img check reports tons of leaked clusters after re-start nfs service to resume writing data in guest) +- Resolves: bz#1533158 + (QEMU support for libvirtd restarting qemu-pr-helper) +- Resolves: bz#1535914 + (Disable io throttling for one member disk of a group during io will induce the other one hang with io) +- Resolves: bz#1537956 + (RFE: qemu-img amend should list the true supported options) +- Resolves: bz#1542080 + (Qemu core dump at cirrus_invalidate_region) +- Resolves: bz#1549654 + (Reject node-names which would be truncated by the block layer commands) +- Resolves: bz#1556678 + (Hot plug usb-ccid for the 2nd time with the same ID as the 1st time failed) +- Resolves: bz#1557995 + (QAPI schema for RBD storage misses the 'password-secret' option) +- Resolves: bz#1558723 + (Create RHEL-7.6 QEMU machine type for AArch64) +- Resolves: bz#1560847 + ([Power8][FW b0320a_1812.861][rhel7.5rc2 
3.10.0-861.el7.ppc64le][qemu-kvm-{ma,rhev}-2.10.0-21.el7_5.1.ppc64le] KVM guest does not default to ori type flush even with pseries-rhel7.5.0-sxxm) +- Resolves: bz#1564576 + (Pegas 1.1 - Require to backport qemu-kvm patch that fixes expected_downtime calculation during migration) +- Resolves: bz#1566153 + (IOERROR pause code lost after resuming a VM while I/O error is still present) +- Resolves: bz#1567733 + (qemu abort when migrate during guest reboot) +- Resolves: bz#1569835 + (qemu-img get wrong backing file path after rebasing image with relative path) +- Resolves: bz#1572851 + (Core dumped after migration when with usb-host) +- Resolves: bz#1572856 + ('block-job-cancel' can not cancel a "drive-mirror" job) +- Resolves: bz#1574216 + (CVE-2018-3639 qemu-kvm-rhev: hw: cpu: speculative store bypass [rhel-7.6]) +- Resolves: bz#1575541 + (qemu core dump while installing win10 guest) +- Resolves: bz#1576598 + (Segfault in qemu-io and qemu-img with -U --image-opts force-share=off) +- Resolves: bz#1576743 + (virtio-rng hangs when running on recent (2.x) QEMU versions) +- Resolves: bz#1578381 + (Error message need update when specify numa distance with node index >=128) +- Resolves: bz#1583959 + (Incorrect vcpu count limit for 7.4 machine types for windows guests) +- Resolves: bz#1584914 + (SATA emulator lags and hangs) +- Resolves: bz#1584984 + (Vm starts failed with 'passthrough' smartcard) +- Resolves: bz#1586255 + (CVE-2018-11806 qemu-kvm-rhev: QEMU: slirp: heap buffer overflow while reassembling fragmented datagrams [rhel-7.6]) +- Resolves: bz#1586313 + (-smp option is not easily found in the output of qemu help) +- Resolves: bz#1586357 + (Disable new devices in 2.12) +- Resolves: bz#1588039 + (Possible assertion failure in qemu when a corrupted image is used during an incoming migration) +- Resolves: bz#1589634 + (Migration failed when rebooting guest with multiple virtio videos) +- Resolves: bz#1590640 + (qemu-kvm: block/io.c:1098: bdrv_co_do_copy_on_readv: 
Assertion `skip_bytes < pnum' failed.) +- Resolves: bz#1591076 + (The driver of 'throttle' is not whitelisted) +- Resolves: bz#1592817 + (Retrying on serial_xmit if the pipe is broken may compromise the Guest) +- Resolves: bz#1594135 + (system_reset many times linux guests cause qemu process Aborted) +- Resolves: bz#1595173 + (blockdev-create is blocking) +- Resolves: bz#1595180 + (Can't set rerror/werror with usb-storage) +- Resolves: bz#1595740 + (RHEL-Alt-7.6 - qemu has error during migration of larger guests) +- Resolves: bz#1599335 + (Image creation locking is too tight and is not properly released) +- Resolves: bz#1599515 + (qemu core-dump with aio_read via hmp (util/qemu-thread-posix.c:64: qemu_mutex_lock_impl: Assertion `mutex->initialized' failed)) +- Resolves: bz#1607891 + (Hotplug events are sometimes lost with virtio-scsi + iothread) +- Resolves: bz#1608778 + (qemu/migration: migrate failed from RHEL.7.6 to RHEL.7.5 with e1000-82540em) + +* Mon Aug 06 2018 Danilo Cesar Lemes de Paula - 2.12.0-17.el8 +- kvm-linux-headers-Update-to-include-KVM_CAP_S390_HPAGE_1.patch [bz#1610906] +- kvm-s390x-Enable-KVM-huge-page-backing-support.patch [bz#1610906] +- kvm-redhat-s390x-add-hpage-1-to-kvm.conf.patch [bz#1610906] +- Resolves: bz#1610906 + ([IBM 8.0 FEAT] KVM: Huge Pages - libhugetlbfs Enablement - qemu-kvm part) + +* Tue Jul 31 2018 Danilo Cesar Lemes de Paula - 2.12.0-16.el8 +- kvm-spapr-Correct-inverted-test-in-spapr_pc_dimm_node.patch [bz#1601671] +- kvm-osdep-powerpc64-align-memory-to-allow-2MB-radix-THP-.patch [bz#1601317] +- kvm-RHEL-8.0-Add-pseries-rhel7.6.0-sxxm-machine-type.patch [bz#1595501] +- kvm-i386-Helpers-to-encode-cache-information-consistentl.patch [bz#1597739] +- kvm-i386-Add-cache-information-in-X86CPUDefinition.patch [bz#1597739] +- kvm-i386-Initialize-cache-information-for-EPYC-family-pr.patch [bz#1597739] +- kvm-i386-Add-new-property-to-control-cache-info.patch [bz#1597739] +- kvm-i386-Clean-up-cache-CPUID-code.patch [bz#1597739] +- 
kvm-i386-Populate-AMD-Processor-Cache-Information-for-cp.patch [bz#1597739] +- kvm-i386-Add-support-for-CPUID_8000_001E-for-AMD.patch [bz#1597739] +- kvm-i386-Fix-up-the-Node-id-for-CPUID_8000_001E.patch [bz#1597739] +- kvm-i386-Enable-TOPOEXT-feature-on-AMD-EPYC-CPU.patch [bz#1597739] +- kvm-i386-Remove-generic-SMT-thread-check.patch [bz#1597739] +- kvm-i386-Allow-TOPOEXT-to-be-enabled-on-older-kernels.patch [bz#1597739] +- Resolves: bz#1595501 + (Create pseries-rhel7.6.0-sxxm machine type) +- Resolves: bz#1597739 + (AMD EPYC/Zen SMT support for KVM / QEMU guest (qemu-kvm)) +- Resolves: bz#1601317 + (RHEL8.0 - qemu patch to align memory to allow 2MB THP) +- Resolves: bz#1601671 + (After rebooting guest,all the hot plug memory will be assigned to the 1st numa node.) + +* Tue Jul 24 2018 Danilo Cesar Lemes de Paula - 2.12.0-15.el8 +- kvm-spapr-Add-ibm-max-associativity-domains-property.patch [bz#1599593] +- kvm-Revert-spapr-Don-t-allow-memory-hotplug-to-memory-le.patch [bz#1599593] +- kvm-simpletrace-Convert-name-from-mapping-record-to-str.patch [bz#1594969] +- kvm-tests-fix-TLS-handshake-failure-with-TLS-1.3.patch [bz#1602403] +- Resolves: bz#1594969 + (simpletrace.py fails when running with Python 3) +- Resolves: bz#1599593 + (User can't hotplug memory to less memory numa node on rhel8) +- Resolves: bz#1602403 + (test-crypto-tlssession unit test fails with assertions) + +* Mon Jul 09 2018 Danilo Cesar Lemes de Paula - 2.12.0-14.el8 +- kvm-vfio-pci-Default-display-option-to-off.patch [bz#1590511] +- kvm-python-futurize-f-libfuturize.fixes.fix_print_with_i.patch [bz#1571533] +- kvm-python-futurize-f-lib2to3.fixes.fix_except.patch [bz#1571533] +- kvm-Revert-Defining-a-shebang-for-python-scripts.patch [bz#1571533] +- kvm-spec-Fix-ambiguous-python-interpreter-name.patch [bz#1571533] +- kvm-qemu-ga-blacklisting-guest-exec-and-guest-exec-statu.patch [bz#1518132] +- kvm-redhat-rewrap-build_configure.sh-cmdline-for-the-rh-.patch +- 
kvm-redhat-remove-the-VTD-LIVE_BLOCK_OPS-and-RHV-options.patch +- kvm-redhat-fix-the-rh-env-prep-target-s-dependency-on-th.patch +- kvm-redhat-remove-dead-code-related-to-s390-not-s390x.patch +- kvm-redhat-sync-compiler-flags-from-the-spec-file-to-rh-.patch +- kvm-redhat-sync-guest-agent-enablement-and-tcmalloc-usag.patch +- kvm-redhat-fix-up-Python-3-dependency-for-building-QEMU.patch +- kvm-redhat-fix-up-Python-dependency-for-SRPM-generation.patch +- kvm-redhat-disable-glusterfs-dependency-support-temporar.patch +- Resolves: bz#1518132 + (Ensure file access RPCs are disabled by default) +- Resolves: bz#1571533 + (Convert qemu-kvm python scripts to python3) +- Resolves: bz#1590511 + (Fails to start guest with Intel vGPU device) + +* Thu Jun 21 2018 Danilo C. L. de Paula - 2.12.0-13.el8 +- Resolves: bz#1508137 + ([IBM 8.0 FEAT] KVM: Interactive Bootloader (qemu)) +- Resolves: bz#1513558 + (Remove RHEL6 machine types) +- Resolves: bz#1568600 + (pc-i440fx-rhel7.6.0 and pc-q35-rhel7.6.0 machine types (x86)) +- Resolves: bz#1570029 + ([IBM 8.0 FEAT] KVM: 3270 Connectivity - qemu part) +- Resolves: bz#1578855 + (Enable Native Ceph support on non x86_64 CPUs) +- Resolves: bz#1585651 + (RHEL 7.6 new pseries machine type (ppc64le)) +- Resolves: bz#1592337 + ([IBM 8.0 FEAT] KVM: CPU Model z14 ZR1 (qemu-kvm)) + +* Tue May 15 2018 Danilo C. L. de Paula - 2.12.0-11.el8.1 +- Resolves: bz#1576468 + (Enable vhost_user in qemu-kvm 2.12) + +* Wed May 09 2018 Danilo de Paula - 2.12.0-11.el8 +- Resolves: bz#1574406 + ([RHEL 8][qemu-kvm] Failed to find romfile "efi-virtio.rom") +- Resolves: bz#1569675 + (Backwards compatibility of pc-*-rhel7.5.0 and older machine-types) +- Resolves: bz#1576045 + (Fix build issue by using python3) +- Resolves: bz#1571145 + (qemu-kvm segfaults on RHEL 8 when run guestfsd under TCG) + +* Fri Apr 20 2018 Danilo de Paula - 2.12.0-10.el +- Fixing some issues with packaging. 
+- Rebasing to 2.12.0-rc4 + +* Fri Apr 13 2018 Danilo de Paula - 2.11.0-7.el8 +- Bumping epoch for RHEL8 and dropping self-obsoleting + +* Thu Apr 12 2018 Danilo de Paula - 2.11.0-6.el8 +- Rebuilding + +* Mon Mar 05 2018 Danilo de Paula - 2.11.0-5.el8 +- Prepare building on RHEL-8.0 diff --git a/qemu-pr-helper.service b/qemu-pr-helper.service new file mode 100755 index 0000000..a1d27b0 --- /dev/null +++ b/qemu-pr-helper.service @@ -0,0 +1,15 @@ +[Unit] +Description=Persistent Reservation Daemon for QEMU + +[Service] +WorkingDirectory=/tmp +Type=simple +ExecStart=/usr/bin/qemu-pr-helper +PrivateTmp=yes +ProtectSystem=strict +ReadWritePaths=/var/run +RestrictAddressFamilies=AF_UNIX +Restart=always +RestartSec=0 + +[Install] diff --git a/qemu-pr-helper.socket b/qemu-pr-helper.socket new file mode 100755 index 0000000..9d7c3e5 --- /dev/null +++ b/qemu-pr-helper.socket @@ -0,0 +1,9 @@ +[Unit] +Description=Persistent Reservation Daemon for QEMU + +[Socket] +ListenStream=/run/qemu-pr-helper.sock +SocketMode=0600 + +[Install] +WantedBy=multi-user.target diff --git a/udev-kvm-check.c b/udev-kvm-check.c new file mode 100755 index 0000000..928b9de --- /dev/null +++ b/udev-kvm-check.c @@ -0,0 +1,155 @@ +/* + * udev-kvm-check.c + * + * Copyright 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> + +#define DEFAULT 0 +#define FACILITY "kvm" +#define SYSCONFIG_KVM "/etc/sysconfig/kvm" + +#define COUNT_MSG \ + "%d %s now active" + +int get_threshold_from_file(FILE *fp) +{ + static const char key[] = "THRESHOLD="; + int pos = 0; + int thres; + int ch; + +start: + /* State START - at beginning of line, search for beginning of "THRESHOLD=" + * string. + */ + ch = getc(fp); + if (ch == EOF) { + return DEFAULT; + } + if (isspace(ch)) { + goto start; + } + if (ch == 'T') { + pos = 1; + goto key; + } + goto eol; + +eol: + /* State EOL - loop until end of line */ + ch = getc(fp); + if (ch == EOF) { + return DEFAULT; + } + if (ch == '\n') { + goto start; + } + goto eol; + +key: + /* State KEY - match "THRESHOLD=" string, go to THRESHOLD if found */ + ch = getc(fp); + if (ch == EOF) { + return DEFAULT; + } + if (ch == key[pos]) { + pos++; + if (key[pos] == 0) { + goto threshold; + } else { + goto key; + } + } + goto eol; + +threshold: + /* State THRESHOLD - parse number using fscanf, expect comment or space + * or EOL. + */ + ch = getc(fp); + if (ch == EOF) { + return DEFAULT; + } + if (!isdigit(ch)) { + goto eol; + } + ungetc(ch, fp); + if (fscanf(fp, "%d", &thres) != 1) { + return DEFAULT; + } + ch = getc(fp); + if (ch == '#' || ch == EOF || ch == '\n' || isspace(ch)) { + return thres; + } + goto eol; +} + +int get_threshold() +{ + FILE *fp = fopen(SYSCONFIG_KVM, "r"); + int val; + + if (!fp) { + return DEFAULT; + } + + val = get_threshold_from_file(fp); + fclose (fp); + return val; +} + +const char *guest(int count) +{ + return (count == 1 ? 
"guest" : "guests"); +} + +void emit_count_message(int count) +{ + openlog(FACILITY, LOG_CONS, LOG_USER); + syslog(LOG_INFO, COUNT_MSG, count, guest(count)); + closelog(); +} + +int main(int argc, char **argv) +{ + int count, threshold; + + if (argc < 3) + exit(1); + + count = atoi(argv[1]); + threshold = get_threshold(); + + if (!strcmp(argv[2], "create")) { + if (threshold == 0 || count > threshold) { + emit_count_message(count); + } + } else { + if (count >= threshold) { + emit_count_message(count); + } + } + + return 0; +} diff --git a/vhost.conf b/vhost.conf new file mode 100755 index 0000000..68d6d7f --- /dev/null +++ b/vhost.conf @@ -0,0 +1,3 @@ +# Increase default vhost memory map limit to match +# KVM's memory slot limit +options vhost max_mem_regions=509 -- Gitee