diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 5533b025ec8c0ae48fbbb14077279a1700b92e08..90fe7d3dcd9be70df9f5c899572f3e24779438ff 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -1193,14 +1193,17 @@ static int idxd_wq_load_config(struct idxd_wq *wq) wq->size = wq->wqcfg->wq_size; wq->threshold = wq->wqcfg->wq_thresh; - /* The driver does not support shared WQ mode in read-only config yet */ - if (wq->wqcfg->mode == 0 || wq->wqcfg->pasid_en) - return -EOPNOTSUPP; - - set_bit(WQ_FLAG_DEDICATED, &wq->flags); + if (wq->wqcfg->mode) + set_bit(WQ_FLAG_DEDICATED, &wq->flags); wq->priority = wq->wqcfg->priority; + if (wq->wqcfg->bof) + set_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + + if (wq->wqcfg->mode_support) + set_bit(WQ_FLAG_MODE_1, &wq->flags); + wq->max_xfer_bytes = 1ULL << wq->wqcfg->max_xfer_shift; wq->max_batch_size = 1ULL << wq->wqcfg->max_batch_shift; @@ -1451,26 +1454,27 @@ int __drv_enable_wq(struct idxd_wq *wq) } /* - * In the event that the WQ is configurable for pasid and priv bits. - * For kernel wq, the driver should setup the pasid, pasid_en, and priv bit. - * However, for non-kernel wq, the driver should only set the pasid_en bit for - * shared wq. A dedicated wq that is not 'kernel' type will configure pasid and - * pasid_en later on so there is no need to setup. - */ - if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) { - int priv = 0; - - if (device_pasid_enabled(idxd)) { - if (is_idxd_wq_kernel(wq) || wq_shared(wq)) { + * In the event that the WQ is configurable for pasid and priv bits. + * For kernel wq, the driver should setup the pasid, pasid_en, and priv bit. + * However, for non-kernel wq, the driver should only set the pasid_en bit for + * shared wq. A dedicated wq will configure pasid and pasid_en later on so + * there is no need to setup. + */ + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags) || + test_bit(WQ_FLAG_MODE_1, &wq->flags)) { + if (is_idxd_wq_kernel(wq)) { + if (device_pasid_enabled(idxd)) { u32 pasid = wq_dedicated(wq) ? 
idxd->pasid : 0; __idxd_wq_set_pasid_locked(wq, pasid); } + __idxd_wq_set_priv_locked(wq, 1); + } else { + if (device_user_pasid_enabled(idxd) && wq_shared(wq)) + __idxd_wq_set_pasid_locked(wq, 0); + __idxd_wq_set_priv_locked(wq, 0); } - if (is_idxd_wq_kernel(wq)) - priv = 1; - __idxd_wq_set_priv_locked(wq, priv); } rc = 0; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 098cc46010e7b1a4ae6ee60c1a9e836835bb6281..0dc1605789fcce4ab056ffae00b5ee911bbdb286 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -142,6 +142,7 @@ enum idxd_wq_state { enum idxd_wq_flag { WQ_FLAG_DEDICATED = 0, WQ_FLAG_BLOCK_ON_FAULT, + WQ_FLAG_MODE_1, }; enum idxd_wq_type { diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 9f16146951cc4062c8c1b3d9ed2bc091258ed296..ccd498d8ca965c8a6002ae481f8fb8939633d10f 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -51,15 +51,6 @@ struct iommu_dma_cookie { struct iommu_domain *fq_domain; }; -void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, - struct iommu_domain *domain) -{ - struct iommu_dma_cookie *cookie = domain->iova_cookie; - struct iova_domain *iovad = &cookie->iovad; - - free_cpu_cached_iovas(cpu, iovad); -} - static void iommu_dma_entry_dtor(unsigned long data) { struct page *freelist = (struct page *)data; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 619e1d0eb5769887c3928cb3af9b35bcd52e1a6a..ab653181d74bddb480a13cc6af0cd1bb441916e9 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4279,35 +4279,6 @@ static struct notifier_block intel_iommu_memory_nb = { .priority = 0 }; -static void free_all_cpu_cached_iovas(unsigned int cpu) -{ - int i; - - for (i = 0; i < g_num_of_iommus; i++) { - struct intel_iommu *iommu = g_iommus[i]; - struct dmar_domain *domain; - int did; - - if (!iommu) - continue; - - for (did = 0; did < cap_ndoms(iommu->cap); did++) { - domain = get_iommu_domain(iommu, (u16)did); - - if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) - continue; - - iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain); - } - } -} - -static int intel_iommu_cpu_dead(unsigned int cpu) -{ - free_all_cpu_cached_iovas(cpu); - return 0; -} - static void intel_disable_iommus(void) { struct intel_iommu *iommu = NULL; @@ -4602,8 +4573,6 @@ int __init intel_iommu_init(void) bus_set_iommu(&pci_bus_type, &intel_iommu_ops); if (si_domain && !hw_pass_through) register_memory_notifier(&intel_iommu_memory_nb); - cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, - intel_iommu_cpu_dead); down_read(&dmar_global_lock); if (probe_acpi_namespace_devices()) @@ -4672,12 +4641,12 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { if (dev_is_pci(info->dev) && sm_supported(iommu)) { intel_pasid_tear_down_entry(iommu, info->dev, - PASID_RID2PASID, false); + PASID_RID2PASID, false, false); pasid = iommu_get_pasid_from_domain(info->dev, &info->domain->domain); if (pasid != INVALID_IOASID) intel_pasid_tear_down_entry(iommu, info->dev, - pasid, false); + pasid, false, false); } iommu_disable_dev_iotlb(info); @@ -4777,8 +4746,8 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) * Check whether a @domain could be attached to the @dev through the * aux-domain attach/detach APIs. 
*/ -static inline bool -is_aux_domain(struct device *dev, struct iommu_domain *domain) +inline bool is_aux_domain(struct device *dev, + struct iommu_domain *domain) { struct device_domain_info *info = get_domain_info(dev); @@ -4938,7 +4907,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, if (!auxiliary_unlink_device(domain, dev)) { spin_lock(&iommu->lock); intel_pasid_tear_down_entry(iommu, dev, - domain->default_pasid, false); + domain->default_pasid, false, false); domain_detach_iommu(domain, iommu); spin_unlock(&iommu->lock); } @@ -5108,6 +5077,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, u16 did, sid; int ret = 0; u64 size = 0; + bool default_pasid = false; if (!inv_info || !dmar_domain) return -EINVAL; @@ -5155,14 +5125,29 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, * PASID is stored in different locations based on the * granularity. */ - if (inv_info->granularity == IOMMU_INV_GRANU_PASID && - (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) - pasid = inv_info->granu.pasid_info.pasid; - else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && - (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) - pasid = inv_info->granu.addr_info.pasid; - - ret = ioasid_get_if_owned(pasid); + if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { + if (inv_info->granu.pasid_info.flags & + IOMMU_INV_PASID_FLAGS_PASID) { + pasid = inv_info->granu.pasid_info.pasid; + } else { + pasid = domain_get_pasid(domain, dev); + default_pasid = true; + } + } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { + if (inv_info->granu.addr_info.flags & + IOMMU_INV_ADDR_FLAGS_PASID) { + pasid = inv_info->granu.addr_info.pasid; + } else { + pasid = domain_get_pasid(domain, dev); + default_pasid = true; + } + } + + if (default_pasid) + ret = ioasid_get(NULL, pasid); + else + ret = ioasid_get_if_owned(pasid); + if (ret) goto out_unlock; @@ -5862,8 +5847,13 @@ static int intel_iommu_get_nesting_info(struct iommu_domain *domain, info->addr_width = dmar_domain->gaw; info->format = IOMMU_PASID_FORMAT_INTEL_VTD; + /* REVISIT: + * to be precise, may only report SYSWIDE_PASID when pasid is + * supported, also may only report page_resp when PRS is supported + */ info->features = IOMMU_NESTING_FEAT_BIND_PGTBL | - IOMMU_NESTING_FEAT_CACHE_INVLD; + IOMMU_NESTING_FEAT_CACHE_INVLD | + IOMMU_NESTING_FEAT_PAGE_RESP; info->pasid_bits = ilog2(intel_pasid_max_id); memset(&info->padding, 0x0, 12); @@ -6471,7 +6461,7 @@ static void intel_iommu_detach_dev_pasid(struct iommu_domain *domain, return; spin_lock_irqsave(&iommu->lock, flags); - intel_pasid_tear_down_entry(iommu, dev, pasid, false); + intel_pasid_tear_down_entry(iommu, dev, pasid, false, false); spin_unlock_irqrestore(&iommu->lock, flags); } diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 5f3327a87d22e660b792bfa0cc6d55d8d7ec8b19..bce8c205173a6eef07a328db43ccb1185d449cc8 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -309,14 +309,30 @@ static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) } static void -intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) +intel_pasid_clear_entry(struct intel_iommu *iommu, struct device *dev, + u32 pasid, bool fault_ignore, bool keep_pte) { struct pasid_entry *pe; + u64 pe_val; + bool nested; pe = intel_pasid_get_entry(dev, pasid); if (WARN_ON(!pe)) return; + /* + * The guest may reboot from scalable mode to legacy mode. 
During this + * phase, there is no chance to setup SLT. So, we should only reset PGTT + * from NESTED to SL and keep other bits when unbind gpasid is executed. + */ + pe_val = READ_ONCE(pe->val[0]); + nested = (((pe_val >> 6) & 0x7) == PASID_ENTRY_PGTT_NESTED) ? true : false; + if (nested && keep_pte) { + pe_val &= 0xfffffffffffffebf; + WRITE_ONCE(pe->val[0], pe_val); + return; + } + if (fault_ignore && pasid_pte_is_present(pe)) pasid_clear_entry_with_fpd(pe); else @@ -527,13 +543,13 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, static void flush_iotlb_all(struct intel_iommu *iommu, struct device *dev, - u16 did, u16 pgtt, u32 pasid, u64 type) + u16 did, u32 pasid, u64 type) { pasid_cache_invalidation_with_pasid(iommu, did, pasid); if (type) iommu->flush.flush_iotlb(iommu, did, 0, 0, type); - else if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY) + else qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); if (!cap_caching_mode(iommu->cap)) @@ -541,10 +557,12 @@ flush_iotlb_all(struct intel_iommu *iommu, struct device *dev, } void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, - u32 pasid, bool fault_ignore) + u32 pasid, bool fault_ignore, bool keep_pte) { struct pasid_entry *pte; - u16 did, pgtt; + u16 did; + u64 pe_val; + u16 pgtt_type; pte = intel_pasid_get_entry(dev, pasid); if (WARN_ON(!pte)) @@ -554,14 +572,19 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, return; did = pasid_get_domain_id(pte); - pgtt = pasid_pte_get_pgtt(pte); + pe_val = READ_ONCE(pte->val[0]); + pgtt_type = (pe_val >> 6) & 0x7; - intel_pasid_clear_entry(dev, pasid, fault_ignore); + intel_pasid_clear_entry(iommu, dev, pasid, fault_ignore, keep_pte); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(pte, sizeof(*pte)); - flush_iotlb_all(iommu, dev, did, pgtt, pasid, 0); + if (pgtt_type == PASID_ENTRY_PGTT_FL_ONLY || + pgtt_type == PASID_ENTRY_PGTT_PT) + flush_iotlb_all(iommu, dev, did, pasid, 0); + else + flush_iotlb_all(iommu, dev, did, pasid, DMA_TLB_DSI_FLUSH); } /* @@ -873,10 +896,11 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return -EINVAL; /* - * Caller must ensure PASID entry is not in use, i.e. not bind the - * same PASID to the same device twice. + * PASID entries with nesting translation type should not be set + * multiple times. If caller tries to setup nesting for a PASID + * entry which is already nested mode, should fail it. */ - if (pasid_pte_is_present(pte)) + if (pasid_pte_is_present(pte) && pasid_pte_is_nested(pte)) return -EBUSY; pasid_clear_entry(pte); @@ -978,7 +1002,7 @@ int intel_pasid_setup_slade(struct device *dev, struct dmar_domain *domain, pasid_set_slade(pte, value); - flush_iotlb_all(iommu, dev, did, 0, pasid, DMA_TLB_DSI_FLUSH); + flush_iotlb_all(iommu, dev, did, pasid, DMA_TLB_DSI_FLUSH); return 0; } diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 00a5860d0390303befa09c07e627d12a773c31d7..80cb29a635387bfc18858ff5a14872cf902860c8 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -98,10 +98,10 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; } -/* Get PGTT field of a PASID table entry */ -static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) +/* Check if PGTT bits of a PASID table entry is nested. 
*/ +static inline bool pasid_pte_is_nested(struct pasid_entry *pte) { - return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); + return ((READ_ONCE(pte->val[0]) >> 6) & 0x7) == PASID_ENTRY_PGTT_NESTED; } extern unsigned int intel_pasid_max_id; @@ -124,7 +124,7 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, - bool fault_ignore); + bool fault_ignore, bool keep_pte); int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid); void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid); int intel_pasid_setup_slade(struct device *dev, struct dmar_domain *domain, diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 5ce6cbd463cce14fe1946a6a1cbfc5ca764971c2..4674bc502882503f298e08926527208aaa1cb0fa 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -154,14 +154,15 @@ static inline bool intel_svm_capable(struct intel_iommu *iommu) return iommu->flags & VTD_FLAG_SVM_CAPABLE; } -static inline void intel_svm_drop_pasid(ioasid_t pasid) +static inline void intel_svm_drop_pasid(ioasid_t pasid, u64 flags) { /* * Detaching SPID results in UNBIND notification on the set, we must * do this before dropping the IOASID reference, otherwise the * notification chain may get destroyed. */ - ioasid_detach_spid(pasid); + if (!(flags & IOMMU_SVA_HPASID_DEF)) + ioasid_detach_spid(pasid); ioasid_detach_data(pasid); ioasid_put(NULL, pasid); } @@ -184,9 +185,14 @@ static void intel_svm_free_async_fn(struct work_struct *work) list_del_rcu(&sdev->list); spin_lock(&sdev->iommu->lock); intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, - svm->pasid, true); + svm->pasid, true, false); intel_svm_drain_prq(sdev->dev, svm->pasid); spin_unlock(&sdev->iommu->lock); + /* + * Partial assignment needs to delete fault data + */ + if (is_aux_domain(sdev->dev, &sdev->domain->domain)) + iommu_delete_device_fault_data(sdev->dev, svm->pasid); kfree_rcu(sdev, rcu); } /* @@ -194,7 +200,7 @@ static void intel_svm_free_async_fn(struct work_struct *work) * the PASID is in FREE_PENDING state, no one can get new reference. * Therefore, we can safely free the private data svm. 
*/ - intel_svm_drop_pasid(svm->pasid); + intel_svm_drop_pasid(svm->pasid, 0); /* * Free before unbind can only happen with host PASIDs used for @@ -349,7 +355,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) rcu_read_lock(); list_for_each_entry_rcu(sdev, &svm->devs, list) intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, - svm->pasid, true); + svm->pasid, true, false); rcu_read_unlock(); } @@ -378,8 +384,12 @@ static int pasid_to_svm_sdev(struct device *dev, return -EINVAL; svm = ioasid_find(set, pasid, NULL); - if (IS_ERR(svm)) - return PTR_ERR(svm); + if (IS_ERR(svm)) { + if (pasid == PASID_RID2PASID) + svm = NULL; + else + return PTR_ERR(svm); + } if (!svm) goto out; @@ -399,8 +409,10 @@ static int pasid_to_svm_sdev(struct device *dev, return 0; } -int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, - struct iommu_gpasid_bind_data *data) +int intel_svm_bind_gpasid(struct iommu_domain *domain, + struct device *dev, + struct iommu_gpasid_bind_data *data, + void *fault_data) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct intel_svm_dev *sdev = NULL; @@ -409,6 +421,8 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct intel_svm *svm = NULL; unsigned long iflags; int ret = 0; + struct ioasid_set *pasid_set; + u64 hpasid_org; if (WARN_ON(!iommu) || !data) return -EINVAL; @@ -427,25 +441,35 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, if (!dev_is_pci(dev)) return -ENOTSUPP; - /* VT-d supports devices with full 20 bit PASIDs only */ - if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) + /* Except gIOVA binding, VT-d supports devices with full 20 bit PASIDs only */ + if ((data->flags & IOMMU_SVA_HPASID_DEF) == 0 && + pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) return -EINVAL; + dmar_domain = to_dmar_domain(domain); + pasid_set = NULL; //dmar_domain->pasid_set; + /* * We only check host PASID range, we have no knowledge to check * guest PASID range. 
*/ - if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) + if (data->flags & IOMMU_SVA_HPASID_DEF) { + ret = domain_get_pasid(domain, dev); + if (ret < 0) + return ret; + hpasid_org = data->hpasid; + data->hpasid = ret; + /* TODO: may consider to use NULL because host_pasid_set is native scope */ + pasid_set = host_pasid_set; + } else if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) return -EINVAL; info = get_domain_info(dev); if (!info) return -EINVAL; - dmar_domain = to_dmar_domain(domain); - mutex_lock(&pasid_mutex); - ret = pasid_to_svm_sdev(dev, NULL, + ret = pasid_to_svm_sdev(dev, pasid_set, data->hpasid, &svm, &sdev); if (ret) goto out; @@ -473,7 +497,14 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, if (data->flags & IOMMU_SVA_GPASID_VAL) { svm->gpasid = data->gpasid; svm->flags |= SVM_FLAG_GUEST_PASID; - ioasid_attach_spid(data->hpasid, data->gpasid); + if (!(data->flags & IOMMU_SVA_HPASID_DEF)) + ioasid_attach_spid(data->hpasid, data->gpasid); + /* + * Partial assignment needs to add fault data per-pasid + */ + if (is_aux_domain(dev, domain) && fault_data) + iommu_add_device_fault_data(dev, data->hpasid, + fault_data); } ioasid_attach_data(data->hpasid, svm); ioasid_get(NULL, svm->pasid); @@ -492,17 +523,22 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, sdev->dev = dev; sdev->sid = PCI_DEVID(info->bus, info->devfn); sdev->iommu = iommu; + sdev->domain = dmar_domain; /* Only count users if device has aux domains */ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) sdev->users = 1; - /* Set up device context entry for PASID if not enabled already */ - ret = intel_iommu_enable_pasid(iommu, sdev->dev); - if (ret) { - dev_err_ratelimited(dev, "Failed to enable PASID capability\n"); - kfree(sdev); - goto out; + /* For legacy device passthr giova usage, do not enable pasid */ + if ((data->flags & IOMMU_SVA_HPASID_DEF) == 0 && + pci_max_pasids(to_pci_dev(dev)) == PASID_MAX) { + /* Set up device context entry for PASID if not enabled already */ + ret = intel_iommu_enable_pasid(iommu, sdev->dev); + if (ret) { + dev_err_ratelimited(dev, "Failed to enable PASID capability\n"); + kfree(sdev); + goto out; + } } /* @@ -538,22 +574,41 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, kfree(svm); } + if (data->flags & IOMMU_SVA_HPASID_DEF) + data->hpasid = hpasid_org; + mutex_unlock(&pasid_mutex); return ret; } -int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) +int intel_svm_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, u32 pasid, u64 user_flags) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct intel_svm_dev *sdev; struct intel_svm *svm; int ret; + struct dmar_domain *dmar_domain; + struct ioasid_set *pasid_set; + bool keep_pte = false; if (WARN_ON(!iommu)) return -EINVAL; + dmar_domain = to_dmar_domain(domain); + pasid_set = NULL; // dmar_domain->pasid_set; + + if (user_flags & IOMMU_SVA_HPASID_DEF) { + ret = domain_get_pasid(domain, dev); + if (ret < 0) + return ret; + pasid = ret; + pasid_set = host_pasid_set; + keep_pte = true; + } + mutex_lock(&pasid_mutex); - ret = pasid_to_svm_sdev(dev, NULL, pasid, &svm, &sdev); + ret = pasid_to_svm_sdev(dev, pasid_set, pasid, &svm, &sdev); if (ret) goto out; @@ -563,8 +618,13 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) if (!sdev->users) { list_del_rcu(&sdev->list); intel_pasid_tear_down_entry(iommu, dev, - svm->pasid, false); + svm->pasid, false, keep_pte); intel_svm_drain_prq(dev, 
svm->pasid); + /* + * Partial assignment needs to delete fault data + */ + if (is_aux_domain(dev, domain)) + iommu_delete_device_fault_data(dev, pasid); kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { @@ -577,7 +637,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) * the unbind, IOMMU driver will get notified * and perform cleanup. */ - intel_svm_drop_pasid(pasid); + intel_svm_drop_pasid(pasid, user_flags); kfree(svm); } } @@ -689,6 +749,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, sdev->qdep = 0; } + sdev->domain = info->domain; /* Finish the setup now we know we're keeping it */ sdev->users = 1; init_rcu_head(&sdev->rcu); @@ -810,7 +871,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) * large and has to be physically contiguous. So it's * hard to be as defensive as we might like. */ intel_pasid_tear_down_entry(iommu, dev, - svm->pasid, false); + svm->pasid, false, false); intel_svm_drain_prq(dev, svm->pasid); kfree_rcu(sdev, rcu); @@ -996,6 +1057,7 @@ static int prq_to_iommu_prot(struct page_req_dsc *req) static int intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) { + struct device_domain_info *info; struct iommu_fault_event event; if (!dev || !dev_is_pci(dev)) @@ -1028,6 +1090,16 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) sizeof(desc->priv_data)); } + /* + * If the device supports PASID granu scalable mode, reports the + * PASID as vector such that handlers can be dispatched with per + * vector data. + */ + info = get_domain_info(dev); + if (!list_empty(&info->subdevices)) { + dev_dbg(dev, "Aux domain present, assign vector %d\n", desc->pasid); + event.vector = desc->pasid; + } return iommu_report_device_fault(dev, &event); } @@ -1126,65 +1198,71 @@ static irqreturn_t prq_event_thread(int irq, void *d) struct intel_svm_dev *sdev = NULL; struct intel_iommu *iommu = d; struct intel_svm *svm = NULL; - struct page_req_dsc *req; - int head, tail, handled; - u64 address; + int head, tail, handled = 0; + unsigned int flags = 0; - /* - * Clear PPR bit before reading head/tail registers, to ensure that - * we get a new interrupt if needed. - */ + /* Clear PPR bit before reading head/tail registers, to + * ensure that we get a new interrupt if needed. 
*/ writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - handled = (head != tail); while (head != tail) { + struct vm_area_struct *vma; + struct page_req_dsc *req; + struct qi_desc resp; + int result; + vm_fault_t ret; + u64 address; + + handled = 1; req = &iommu->prq[head / sizeof(*req)]; + result = QI_RESP_INVALID; address = (u64)req->addr << VTD_PAGE_SHIFT; - - if (unlikely(!req->pasid_present)) { - pr_err("IOMMU: %s: Page request without PASID\n", - iommu->name); -bad_req: - svm = NULL; - sdev = NULL; - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - goto prq_advance; + if (!req->pasid_present) { + pr_err("%s: Page request without PASID: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + goto no_pasid; } - - if (unlikely(!is_canonical_address(address))) { - pr_err("IOMMU: %s: Address is not canonical\n", - iommu->name); - goto bad_req; + /* We shall not receive page request for supervisor SVM */ + if (req->pm_req && (req->rd_req | req->wr_req)) { + pr_err("Unexpected page request in Privilege Mode"); + /* No need to find the matching sdev as for bad_req */ + goto no_pasid; } - - if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { - pr_err("IOMMU: %s: Page request in Privilege Mode\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->exe_req && req->rd_req)) { - pr_err("IOMMU: %s: Execution request not supported\n", - iommu->name); - goto bad_req; + /* DMA read with exec requeset is not supported. */ + if (req->exe_req && req->rd_req) { + pr_err("Execution request not supported\n"); + goto no_pasid; } - if (!svm || svm->pasid != req->pasid) { - /* - * It can't go away, because the driver is not permitted - * to unbind the mm while any page faults are outstanding. - */ + rcu_read_lock(); svm = ioasid_find(NULL, req->pasid, NULL); - if (IS_ERR_OR_NULL(svm) || (svm->flags & SVM_FLAG_SUPERVISOR_MODE)) - goto bad_req; + /* It *can't* go away, because the driver is not permitted + * to unbind the mm while any page faults are outstanding. + * So we only need RCU to protect the internal idr code. */ + rcu_read_unlock(); + if (IS_ERR_OR_NULL(svm)) { + pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n", + iommu->name, req->pasid, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + goto no_pasid; + } } if (!sdev || sdev->sid != req->rid) { - sdev = svm_lookup_device_by_sid(svm, req->rid); - if (!sdev) - goto bad_req; + struct intel_svm_dev *t; + + sdev = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(t, &svm->devs, list) { + if (t->sid == req->rid) { + sdev = t; + break; + } + } + rcu_read_unlock(); } /* @@ -1192,13 +1270,76 @@ static irqreturn_t prq_event_thread(int irq, void *d) * the fault notifiers, we skip the page response here. */ if (svm->flags & SVM_FLAG_GUEST_MODE) { - if (!intel_svm_prq_report(sdev->dev, req)) + if (sdev && !intel_svm_prq_report(sdev->dev, req)) goto prq_advance; else goto bad_req; } - handle_single_prq_event(iommu, svm->mm, req); + /* Since we're using init_mm.pgd directly, we should never take + * any faults on kernel addresses. */ + if (!svm->mm) + goto bad_req; + + /* If address is not canonical, return invalid response */ + if (!is_canonical_address(address)) + goto bad_req; + + /* If the mm is already defunct, don't handle faults. 
*/ + if (!mmget_not_zero(svm->mm)) + goto bad_req; + + mmap_read_lock(svm->mm); + vma = find_extend_vma(svm->mm, address); + if (!vma || address < vma->vm_start) + goto invalid; + + if (access_error(vma, req)) + goto invalid; + + flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE; + if (req->wr_req) + flags |= FAULT_FLAG_WRITE; + + ret = handle_mm_fault(vma, address, flags, NULL); + if (ret & VM_FAULT_ERROR) + goto invalid; + + result = QI_RESP_SUCCESS; +invalid: + mmap_read_unlock(svm->mm); + mmput(svm->mm); +bad_req: + /* We get here in the error case where the PASID lookup failed, + and these can be NULL. Do not use them below this point! */ + sdev = NULL; + svm = NULL; +no_pasid: + if (req->lpig || req->priv_data_present) { + /* + * Per VT-d spec. v3.0 ch7.7, system software must + * respond with page group response if private data + * is present (PDP) or last page in group (LPIG) bit + * is set. This is an additional VT-d feature beyond + * PCI ATS spec. + */ + resp.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_PDP(req->priv_data_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + resp.qw1 = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_LPIG(req->lpig); + resp.qw2 = 0; + resp.qw3 = 0; + + if (req->priv_data_present) + memcpy(&resp.qw2, req->priv_data, + sizeof(req->priv_data)); + qi_submit_sync(iommu, &resp, 1, 0); + } + prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } @@ -1274,11 +1415,13 @@ u32 intel_svm_get_pasid(struct iommu_sva *sva) return pasid; } -int intel_svm_page_response(struct device *dev, +int intel_svm_page_response(struct iommu_domain *domain, + struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg) { struct iommu_fault_page_request *prm; + struct dmar_domain *dmar_domain; struct intel_svm_dev *sdev = NULL; struct intel_svm *svm = NULL; struct intel_iommu *iommu; @@ -1317,7 +1460,8 @@ int intel_svm_page_response(struct device *dev, goto out; } - ret = pasid_to_svm_sdev(dev, NULL, + dmar_domain = to_dmar_domain(domain); + ret = pasid_to_svm_sdev(dev, NULL, // dmar_domain->pasid_set, prm->pasid, &svm, &sdev); if (ret || !sdev) { ret = -ENODEV; diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 1df8c1dcae7761feb78238ba8bfcfd78a6ed3beb..aaceb0a953caac2b8ecfab3c023dd13f9b84c8ea 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -55,6 +55,7 @@ struct iopf_group { static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, enum iommu_page_response_code status) { + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_page_response resp = { .version = IOMMU_PAGE_RESP_VERSION_1, .pasid = iopf->fault.prm.pasid, @@ -66,7 +67,7 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) resp.flags = IOMMU_PAGE_RESP_PASID_VALID; - return iommu_page_response(dev, &resp); + return iommu_page_response(domain, dev, &resp); } static enum iommu_page_response_code diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 229e262c28bdb8c77b275c6004f687f133c5bbad..16c4829fefcf458fdc5c2a27755499ca7e6b35bf 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1139,6 +1139,7 @@ int iommu_register_device_fault_handler(struct device *dev, void *data) { struct dev_iommu *param = dev->iommu; + struct iommu_fault_handler_data *hdata; int ret = 0; if (!param) @@ -1158,8 +1159,23 @@ int 
iommu_register_device_fault_handler(struct device *dev, ret = -ENOMEM; goto done_unlock; } + param->fault_param->handler = handler; - param->fault_param->data = data; + + hdata = kzalloc(sizeof(struct iommu_fault_handler_data), GFP_KERNEL); + if (!hdata) { + kfree(param->fault_param); + put_device(dev); + ret = -ENOMEM; + goto done_unlock; + } + + INIT_LIST_HEAD(&param->fault_param->data); + /* Default handler data uses reserved vector 0 */ + hdata->data = data; + dev_dbg(dev, "Add IOMMU default handler data %llx\n", (u64)data); + list_add(&hdata->list, &param->fault_param->data); + mutex_init(&param->fault_param->lock); INIT_LIST_HEAD(&param->fault_param->faults); @@ -1173,6 +1189,111 @@ int iommu_register_device_fault_handler(struct device *dev, } EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler); + +/** + * iommu_add_device_fault_data() - add handler-specific data + * + * For devices with partitioned resources, we may need multiple sets of + * handler data that can be identified by the IOMMU driver. This function + * allows device drivers to add handler-specific data associated with + * a vector. When the IOMMU detects a device fault and its vector, the + * handler can be invoked with the matching data. + * For page request service related to a DMA request with PASID, the vector + * is the PASID and the data is PASID-associated data such as a mediated + * device. Vector 0 is reserved for the default handler data when no per-vector + * data has been added to the device handler data list. + * + * @dev: the device + * @vector: identifies fault reporting data + * @data: opaque device handler data associated with the fault + */ +int iommu_add_device_fault_data(struct device *dev, + int vector, void *data) +{ + struct dev_iommu *param = dev->iommu; + struct iommu_fault_handler_data *hdata; + int ret = 0; + + dev_dbg(dev, "%s: vector: %d data: %llx\n", __func__, vector, (u64)data); + /* + * Fault handler must have been registered before adding handler data. + * Vector 0 is reserved for default data associated with handler. + */ + if (!param || !param->fault_param || !vector) + return -EINVAL; + + mutex_lock(&param->lock); + + /* vector must be unique, check if we have the same vector already */ + list_for_each_entry(hdata, &param->fault_param->data, list) { + if (hdata->vector == vector) { + dev_err(dev, "IOMMU fault handler data exists for vector %d\n", vector); + ret = -EINVAL; + goto unlock; + } + } + + hdata = kzalloc(sizeof(struct iommu_fault_handler_data), GFP_KERNEL); + if (!hdata) { + ret = -ENOMEM; + goto unlock; + } + hdata->vector = vector; + hdata->data = data; + dev_dbg(dev, "Added IOMMU fault handler data %llx for vector %d\n", + (u64)data, vector); + list_add_tail(&hdata->list, &param->fault_param->data); + +unlock: + mutex_unlock(&param->lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_add_device_fault_data); + +/** + * iommu_delete_device_fault_data() - delete handler-specific data + * + * For devices with partitioned resources, we may need multiple sets of + * handler data that can be identified by the IOMMU driver. This function + * allows device drivers to remove handler-specific data associated with + * a vector. When the IOMMU detects a device fault and its vector, the + * handler can be invoked with the matching data. + * For page request service related to a DMA request with PASID, the vector + * is the PASID and the data is PASID-associated data such as a mediated + * device. 
+ * @dev: the device + * @vector: identifies fault reporting data to be removed + */ +void iommu_delete_device_fault_data(struct device *dev, int vector) +{ + struct dev_iommu *param = dev->iommu; + struct iommu_fault_handler_data *hdata, *tmp; + + dev_dbg(dev, "%s: vector: %d\n", __func__, vector); + /* + * Fault handler must have been registered before adding handler data. + * Vector 0 is reserved for default data associated with handler. + */ + if (!param || !param->fault_param || !vector) + return; + + mutex_lock(&param->lock); + + list_for_each_entry_safe(hdata, tmp, &param->fault_param->data, list) { + if (hdata->vector == vector) { + list_del(&hdata->list); + kfree(hdata); + dev_dbg(dev, "Deleted IOMMU fault handler data for vector %d\n", vector); + goto unlock; + } + } + dev_err(dev, "Failed to find handler data for vector %d\n", vector); + +unlock: + mutex_unlock(&param->lock); +} +EXPORT_SYMBOL_GPL(iommu_delete_device_fault_data); + /** * iommu_unregister_device_fault_handler() - Unregister the device fault handler * @dev: the device @@ -1186,6 +1307,8 @@ int iommu_unregister_device_fault_handler(struct device *dev) { struct dev_iommu *param = dev->iommu; int ret = 0; + struct iommu_fault_event *evt, *next; + struct iommu_fault_handler_data *hdata, *tmp; if (!param) return -EINVAL; @@ -1197,8 +1320,32 @@ int iommu_unregister_device_fault_handler(struct device *dev) /* we cannot unregister handler if there are pending faults */ if (!list_empty(&param->fault_param->faults)) { - ret = -EBUSY; - goto unlock; + /* + * REVISIT: We should not run into pending faults if we do unbind first. + * The proper termination flow will ensure no pending faults as follows: + * 1. pasid disable and tlb flush + * 2. unbind, free, flush and drain + * 3. unregister fault handler. 
+ */ + dev_dbg(dev, "%s, there are pending faults on dev: %s; forcibly freeing " "the fault events and unregistering the fault handler. " "This should be reverted once the page response path " "is ready\n", __func__, dev_name(dev)); + mutex_lock(&param->fault_param->lock); + list_for_each_entry_safe(evt, next, &param->fault_param->faults, list) { + dev_dbg(dev, "%s, free fault event: 0x%lx\n", __func__, + (unsigned long) evt); + list_del(&evt->list); + kfree(evt); + } + mutex_unlock(&param->fault_param->lock); + } + /* Free any handler data that is still registered */ + list_for_each_entry_safe(hdata, tmp, &param->fault_param->data, list) { + dev_dbg(dev, "%s: free handler data %llx vector %d\n", __func__, + (u64)hdata->data, hdata->vector); + list_del(&hdata->list); + kfree(hdata); } kfree(param->fault_param); @@ -1226,8 +1373,10 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) { struct dev_iommu *param = dev->iommu; struct iommu_fault_event *evt_pending = NULL; + struct iommu_fault_handler_data *hdata; struct iommu_fault_param *fparam; struct timer_list *tmr; + void *handler_data = NULL; int ret = 0; u64 exp; @@ -1254,7 +1403,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) exp = get_jiffies_64() + prq_timeout; evt_pending->expire = exp; mutex_lock(&fparam->lock); - if (list_empty(&fparam->faults)) { + if (list_empty(&fparam->faults) && prq_timeout) { /* First pending event, start timer */ tmr = &dev->iommu->fault_param->timer; WARN_ON(timer_pending(tmr)); @@ -1265,7 +1414,38 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) mutex_unlock(&fparam->lock); } - ret = fparam->handler(&evt->fault, fparam->data); + if (!evt->vector) { + hdata = list_first_entry(&fparam->data, + struct iommu_fault_handler_data, list); + handler_data = hdata->data; + dev_dbg(dev, "%s: default handler data %llx\n", + __func__, (u64)handler_data); + } else { + /* Find data for matching vector */ + list_for_each_entry(hdata, &param->fault_param->data, list) { + dev_dbg(dev, "Searching handler data vector %d to match %llu\n", + hdata->vector, evt->vector); + + if (hdata->vector == evt->vector) { + handler_data = hdata->data; + dev_dbg(dev, "IOMMU report data %llx on fault vector %llu\n", + (u64)handler_data, evt->vector); + break; + } + } + } + if (!handler_data) { + dev_err(dev, "No valid handler data for vector %llu\n", evt->vector); + if (evt_pending) + list_del(&evt_pending->list); + ret = -ENODEV; + goto done_unlock; + } + dev_dbg(dev, "%s: calling handler with data %llx\n", + __func__, (u64)handler_data); + + ret = fparam->handler(&evt->fault, handler_data); + trace_dev_fault(dev, &evt->fault); if (ret && evt_pending) { mutex_lock(&fparam->lock); list_del(&evt_pending->list); @@ -1280,7 +1460,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) EXPORT_SYMBOL_GPL(iommu_report_device_fault); static int iommu_page_response_prepare_msg(void __user *udata, - struct iommu_page_response *msg) + struct iommu_page_response *msg) { unsigned long minsz, maxsz; @@ -1314,7 +1494,8 @@ static int iommu_page_response_prepare_msg(void __user *udata, return 0; } -int iommu_page_response(struct device *dev, +int iommu_page_response(struct iommu_domain *domain, + struct device *dev, void __user *uinfo) { bool needs_pasid; @@ -1323,7 +1504,6 @@ int iommu_page_response(struct device *dev, struct iommu_fault_event *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; - struct iommu_domain 
*domain = iommu_get_domain_for_dev(dev); bool has_pasid; if (!domain || !domain->ops->page_response) @@ -1369,7 +1549,7 @@ int iommu_page_response(struct device *dev, msg.pasid = 0; } - ret = domain->ops->page_response(dev, evt, &msg); + ret = domain->ops->page_response(domain, dev, evt, &msg); trace_dev_page_response(dev, &msg); list_del(&evt->list); kfree(evt); @@ -1377,7 +1557,7 @@ } /* stop response timer if no more pending request */ - if (list_empty(&param->fault_param->faults) && + if (prq_timeout && list_empty(&param->fault_param->faults) && timer_pending(&param->fault_param->timer)) { pr_debug("no pending PRQ, stop timer\n"); del_timer(&param->fault_param->timer); @@ -2226,7 +2406,7 @@ static int iommu_check_bind_data(struct iommu_gpasid_bind_data *data) return -EINVAL; /* Check all flags */ - mask = IOMMU_SVA_GPASID_VAL; + mask = IOMMU_SVA_GPASID_VAL | IOMMU_SVA_HPASID_DEF; if (data->flags & ~mask) return -EINVAL; @@ -2271,8 +2451,15 @@ static int iommu_sva_prepare_bind_data(void __user *udata, return iommu_check_bind_data(data); } + +/* + * Caller could provide fault_data to differentiate future page + * requests from the device. This is helpful for page request + * handling for partial assignments of physical devices, e.g. + * mediated device assignment or other sub-device solutions. + */ int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, - void __user *udata) + void __user *udata, void *fault_data) { struct iommu_gpasid_bind_data data = { 0 }; int ret; @@ -2284,10 +2471,11 @@ int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, if (ret) return ret; - ret = ioasid_get_if_owned(data.hpasid); + ret = ioasid_get(NULL, data.hpasid); if (ret) return ret; - ret = domain->ops->sva_bind_gpasid(domain, dev, &data); + + ret = domain->ops->sva_bind_gpasid(domain, dev, &data, fault_data); ioasid_put(NULL, data.hpasid); return ret; @@ -2295,12 +2483,13 @@ EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid); int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid) + ioasid_t pasid, u64 flags) { + pr_warn("%s: FIXME need to clear all pending faults!\n", __func__); if (unlikely(!domain->ops->sva_unbind_gpasid)) return -ENODEV; - return domain->ops->sva_unbind_gpasid(dev, pasid); + return domain->ops->sva_unbind_gpasid(domain, dev, pasid, flags); } EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid); @@ -2317,10 +2506,10 @@ int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev if (ret) return ret; - ret = ioasid_get_if_owned(data.hpasid); + ret = ioasid_get(NULL, data.hpasid); if (ret) return ret; - ret = iommu_sva_unbind_gpasid(domain, dev, data.hpasid); + ret = iommu_sva_unbind_gpasid(domain, dev, data.hpasid, data.flags); ioasid_put(NULL, data.hpasid); return ret; diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 4600e97acb26457bd9a4382aae59c009b6d1f389..0c8810002404f5b454dc0b5fd9093c1aa4c8c66d 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -22,6 +22,7 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn); static void init_iova_rcaches(struct iova_domain *iovad); +static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); static void free_iova_rcaches(struct iova_domain *iovad); static void fq_destroy_all_entries(struct iova_domain *iovad); static void fq_flush_timeout(struct 
timer_list *t); @@ -1029,7 +1030,7 @@ static void free_iova_rcaches(struct iova_domain *iovad) /* * free all the IOVA ranges cached by a cpu (used when cpu is unplugged) */ -void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) +static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) { struct iova_cpu_rcache *cpu_rcache; struct iova_rcache *rcache; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index b5a7e2a4f47e5b8a1d90e02445415a90b181a75e..1f0c8a1e65178de81820aa691f7283a2d4bd4997 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -40,6 +40,7 @@ #include #include #include +#include #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson " @@ -159,6 +160,26 @@ struct domain_capsule { u64 flags; }; +/* iommu->lock must be held */ +static int vfio_prepare_nesting_domain_capsule(struct vfio_iommu *iommu, + struct domain_capsule *dc) +{ + struct vfio_domain *domain; + struct vfio_group *group; + + if (!iommu->nesting_info) + return -EINVAL; + + domain = list_first_entry(&iommu->domain_list, + struct vfio_domain, next); + group = list_first_entry(&domain->group_list, + struct vfio_group, next); + dc->group = group; + dc->domain = domain->domain; + dc->user = true; + return 0; +} + static int put_pfn(unsigned long pfn, int prot); static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, @@ -2485,6 +2506,79 @@ static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu, return ret; } +static int vfio_dev_bind_gpasid_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + unsigned long arg = *(unsigned long *)dc->data; + struct mdev_device *mdev = to_mdev_device(dev); + struct device *iommu_device; + void *iommu_fault_data = NULL; + + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + if (iommu_device != dev) + iommu_fault_data = mdev_get_iommu_fault_data(mdev); + + return iommu_uapi_sva_bind_gpasid(dc->domain, iommu_device, + (void __user *)arg, + iommu_fault_data); +} + +static int vfio_dev_unbind_gpasid_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + struct device *iommu_device; + + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + /* + * dc->user is a toggle for the unbind operation. When user + * set, the dc->data passes in a __user pointer and requires + * to use iommu_uapi_sva_unbind_gpasid(), in which it will + * copy the unbind data from the user buffer. When user is + * clear, the dc->data passes in a pasid which is going to + * be unbind no need to copy data from userspace. 
+ */ + if (dc->user) { + unsigned long arg = *(unsigned long *)dc->data; + + iommu_uapi_sva_unbind_gpasid(dc->domain, iommu_device, + (void __user *)arg); + } else { + ioasid_t pasid = *(ioasid_t *)dc->data; + + iommu_sva_unbind_gpasid(dc->domain, iommu_device, pasid, dc->flags); + } + return 0; +} + +static void vfio_group_unbind_gpasid_fn(ioasid_t pasid, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + + dc->user = false; + dc->data = &pasid; + + iommu_group_for_each_dev(dc->group->iommu_group, + dc, vfio_dev_unbind_gpasid_fn); +} + +static void vfio_group_unbind_default_gpasid(ioasid_t pasid, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + + dc->user = false; + dc->data = &pasid; + dc->flags = IOMMU_SVA_HPASID_DEF; + + iommu_group_for_each_dev(dc->group->iommu_group, + dc, vfio_dev_unbind_gpasid_fn); +} + static void vfio_iommu_type1_detach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -2529,6 +2623,33 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (!group) continue; +#if IS_ENABLED(CONFIG_IOASID_USER) + if (iommu->nesting_info && + iommu->nesting_info->features & + IOMMU_NESTING_FEAT_BIND_PGTBL) { + struct domain_capsule dc = { .group = group, + .domain = domain->domain, + .data = NULL }; + struct ioasid_user *iuser; + + /* + * For devices attached to nesting type iommu, + * VFIO should unbind page tables bound with the + * devices in the iommu group before detaching. + */ + iuser = ioasid_user_get_from_task(current); + if (!(IS_ERR(iuser) || !iuser)) { + ioasid_user_for_each_id(iuser, &dc, + vfio_group_unbind_gpasid_fn); + ioasid_user_put(iuser); + } + /* + * We should explicitly call interface to unbind default pasid gIOVA + * page table here. + */ + vfio_group_unbind_default_gpasid(0, &dc); + } +#endif vfio_iommu_detach_group(domain, group); update_dirty_scope = !group->pinned_page_dirty_scope; update_iommu_hwdbm = !group->iommu_hwdbm; @@ -3071,6 +3192,153 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, return -EINVAL; } +static long vfio_iommu_handle_pgtbl_op(struct vfio_iommu *iommu, + bool is_bind, unsigned long arg) +{ + struct domain_capsule dc = { .data = &arg, .user = true }; + struct iommu_nesting_info *info; + int ret; + + mutex_lock(&iommu->lock); + + info = iommu->nesting_info; + if (!info || !(info->features & IOMMU_NESTING_FEAT_BIND_PGTBL)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = vfio_prepare_nesting_domain_capsule(iommu, &dc); + if (ret) + goto out_unlock; + + if (is_bind) + ret = iommu_group_for_each_dev(dc.group->iommu_group, &dc, + vfio_dev_bind_gpasid_fn); + if (ret || !is_bind) + iommu_group_for_each_dev(dc.group->iommu_group, + &dc, vfio_dev_unbind_gpasid_fn); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + +static int vfio_dev_cache_invalidate_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + unsigned long arg = *(unsigned long *)dc->data; + struct device *iommu_device; + + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + iommu_uapi_cache_invalidate(dc->domain, iommu_device, + (void __user *)arg); + return 0; +} + +static long vfio_iommu_invalidate_cache(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct domain_capsule dc = { .data = &arg }; + struct iommu_nesting_info *info; + int ret; + + mutex_lock(&iommu->lock); + info = iommu->nesting_info; + if (!info || !(info->features & IOMMU_NESTING_FEAT_CACHE_INVLD)) { 
+ ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = vfio_prepare_nesting_domain_capsule(iommu, &dc); + if (ret) + goto out_unlock; + + iommu_group_for_each_dev(dc.group->iommu_group, &dc, + vfio_dev_cache_invalidate_fn); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + +static int vfio_dev_page_resp_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + unsigned long arg = *(unsigned long *) dc->data; + struct device *iommu_device; + + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + return iommu_page_response(dc->domain, iommu_device, + (void __user *) arg); +} + +static long vfio_iommu_page_response(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct domain_capsule dc = { .data = &arg }; + struct iommu_nesting_info *info; + int ret; + + mutex_lock(&iommu->lock); + info = iommu->nesting_info; + if (!info || !(info->features & IOMMU_NESTING_FEAT_PAGE_RESP)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = vfio_prepare_nesting_domain_capsule(iommu, &dc); + if (ret) + goto out_unlock; + + ret = iommu_group_for_each_dev(dc.group->iommu_group, &dc, + vfio_dev_page_resp_fn); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + +static long vfio_iommu_type1_nesting_op(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_nesting_op hdr; + unsigned int minsz; + int ret; + + minsz = offsetofend(struct vfio_iommu_type1_nesting_op, flags); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.flags & ~VFIO_NESTING_OP_MASK) + return -EINVAL; + + switch (hdr.flags & VFIO_NESTING_OP_MASK) { + case VFIO_IOMMU_NESTING_OP_BIND_PGTBL: + ret = vfio_iommu_handle_pgtbl_op(iommu, true, arg + minsz); + break; + case VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL: + ret = vfio_iommu_handle_pgtbl_op(iommu, false, arg + minsz); + break; + case VFIO_IOMMU_NESTING_OP_CACHE_INVLD: + ret = vfio_iommu_invalidate_cache(iommu, arg + minsz); + break; + case VFIO_IOMMU_NESTING_OP_PAGE_RESP: + ret = vfio_iommu_page_response(iommu, arg + minsz); + break; + default: + ret = -EINVAL; + } + + return ret; +} + static long vfio_iommu_type1_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -3087,6 +3355,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, return vfio_iommu_type1_unmap_dma(iommu, arg); case VFIO_IOMMU_DIRTY_PAGES: return vfio_iommu_type1_dirty_pages(iommu, arg); + case VFIO_IOMMU_NESTING_OP: + return vfio_iommu_type1_nesting_op(iommu, arg); default: return -ENOTTY; } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4c90a92e1f5dc6888c60293328aeadf613bb8cd4..54f8b61d5bce80249a5bff58145e93a863b26407 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -57,7 +57,6 @@ enum cpuhp_state { CPUHP_PAGE_ALLOC_DEAD, CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, - CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index bbf29c981bbdd7b932858a375f25f25363684df8..d810139229e2d9d6ad2df21a07b52bdd08f1b531 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -84,10 +84,5 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he { } -static inline void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, - struct iommu_domain *domain) -{ -} - #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */ diff --git 
a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 37171ae2afe0a7ea752cc94a83fde77b0fb9efff..e4937dd0a6e2dfa8605ac54a25c37570559ad6c0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -760,19 +760,25 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); struct dmar_domain *find_domain(struct device *dev); struct device_domain_info *get_domain_info(struct device *dev); struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); +int domain_get_pasid(struct iommu_domain *domain, struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); +inline bool is_aux_domain(struct device *dev, + struct iommu_domain *domain); int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, - struct iommu_gpasid_bind_data *data); -int intel_svm_unbind_gpasid(struct device *dev, u32 pasid); + struct iommu_gpasid_bind_data *data, + void *fault_data); +int intel_svm_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, u32 pasid, u64 user_flags); struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata); void intel_svm_unbind(struct iommu_sva *handle); u32 intel_svm_get_pasid(struct iommu_sva *handle); -int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, +int intel_svm_page_response(struct iommu_domain *domain, struct device *dev, + struct iommu_fault_event *evt, struct iommu_page_response *msg); void intel_svm_add_pasid_notifier(void); @@ -781,6 +787,7 @@ struct intel_svm_dev { struct rcu_head rcu; struct device *dev; struct intel_iommu *iommu; + struct dmar_domain *domain; struct iommu_sva sva; u32 pasid; int users; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e1b5ce697fecd2f722c29b95f97350b16a92bbc6..88370d6b4fa7cbddc860058fb6a7b1ae4da5bdbd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -324,15 +324,19 @@ struct iommu_ops { void (*sva_unbind)(struct iommu_sva *handle); u32 (*sva_get_pasid)(struct iommu_sva *handle); - int (*page_response)(struct device *dev, + int (*page_response)(struct iommu_domain *domain, + struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev, struct iommu_cache_invalidate_info *inv_info); int (*sva_bind_gpasid)(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data); + struct device *dev, + struct iommu_gpasid_bind_data *data, + void *fault_data); - int (*sva_unbind_gpasid)(struct device *dev, u32 pasid); + int (*sva_unbind_gpasid)(struct iommu_domain *domain, + struct device *dev, u32 pasid, u64 flags); void (*sva_suspend_pasid)(struct device *dev, u32 pasid); @@ -394,19 +398,26 @@ struct iommu_fault_event { struct iommu_fault fault; struct list_head list; u64 expire; + u64 vector; +}; + +struct iommu_fault_handler_data { + u32 vector; + void *data; + struct list_head list; }; /** * struct iommu_fault_param - per-device IOMMU fault data * @handler: Callback function to handle IOMMU faults at device level - * @data: handler private data - * @faults: holds the pending faults which needs response + * @data: handler private data list + * @faults: holds the pending faults which needs response, e.g. page response. 
* @lock: protect pending faults list * @timer: track page request pending time limit */ struct iommu_fault_param { iommu_dev_fault_handler_t handler; - void *data; + struct list_head data; struct list_head faults; struct timer_list timer; struct mutex lock; @@ -497,11 +508,14 @@ extern int iommu_uapi_cache_invalidate(struct iommu_domain *domain, void __user *uinfo); extern int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, void __user *udata); + struct device *dev, + void __user *udata, + void *fault_data); extern int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, void __user *udata); extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, ioasid_t pasid, + u64 flags); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern size_t iommu_pgsize(struct iommu_domain *domain, @@ -567,7 +581,12 @@ extern int iommu_unregister_device_fault_handler(struct device *dev); extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt); -extern int iommu_page_response(struct device *dev, void __user *uinfo); +extern int iommu_add_device_fault_data(struct device *dev, + int vector, void *data); +extern void iommu_delete_device_fault_data(struct device *dev, int vector); +extern int iommu_page_response(struct iommu_domain *domain, + struct device *dev, + void __user *uinfo); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); @@ -970,7 +989,20 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) return -ENODEV; } -static inline int iommu_page_response(struct device *dev, void __user *uinfo) +static inline +int iommu_add_device_fault_data(struct device *dev, int vector, void *data) +{ + return -ENODEV; +} + +static inline +void iommu_delete_device_fault_data(struct device *dev, int vector) +{ +} + +static inline int iommu_page_response(struct iommu_domain *domain, + struct device *dev, + void __user *uinfo) { return -ENODEV; } @@ -1173,7 +1205,8 @@ iommu_uapi_cache_invalidate(struct iommu_domain *domain, } static inline int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, void __user *udata) + struct device *dev, void __user *udata, + void *fault_data) { return -ENODEV; } @@ -1186,7 +1219,8 @@ static inline int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid) + ioasid_t pasid, + u64 flags) { return -ENODEV; } diff --git a/include/linux/iova.h b/include/linux/iova.h index a0637abffee88b0f0b12f3c6520cd5d2d77e779d..c810a95bfb308c5b13f477d7f8e705c3d8e5f34e 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -162,7 +162,6 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi); -void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); #else static inline int iova_cache_get(void) { @@ -266,10 +265,6 @@ static inline struct iova *split_and_remove_iova(struct iova_domain *iovad, return NULL; } -static inline void free_cpu_cached_iovas(unsigned int cpu, - struct iova_domain *iovad) -{ -} #endif 
#endif diff --git a/include/uapi/linux/iommu.h index e333b7224436aeb2426917c784558bd31fd2e39b..ec5ad48704ffcf0a6c44f98ffeae8114d97b87f1 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -328,6 +328,7 @@ struct iommu_gpasid_bind_data { __u32 format; __u32 addr_width; #define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ +#define IOMMU_SVA_HPASID_DEF (1 << 1) /* use default host PASID */ __u64 flags; __u64 gpgd; __u64 hpasid; @@ -386,6 +387,9 @@ struct iommu_nesting_info_vtd { * | | UAPI according to vendor-specific requirement when | * | | changing the 1st level/stage page table. | * +---------------+------------------------------------------------------+ + * | PAGE_RESP | IOMMU vendor driver sets it to mandate userspace to | + * | | respond to page requests reported by kernel space | + * +---------------+------------------------------------------------------+ * * data struct types defined for @format: * +================================+=====================================+ @@ -401,6 +405,7 @@ struct iommu_nesting_info { __u32 format; #define IOMMU_NESTING_FEAT_BIND_PGTBL (1 << 0) #define IOMMU_NESTING_FEAT_CACHE_INVLD (1 << 1) +#define IOMMU_NESTING_FEAT_PAGE_RESP (1 << 2) __u32 features; __u16 addr_width; __u16 pasid_bits; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 81cd642971d82e69001497d696efa054e3fd7a73..f408c1f4b183b364e31f0e59c109dc3b135c8877 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1255,6 +1255,47 @@ struct vfio_iommu_type1_dirty_bitmap_get { #define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) +/** + * VFIO_IOMMU_NESTING_OP - _IOW(VFIO_TYPE, VFIO_BASE + 18, + * struct vfio_iommu_type1_nesting_op) + * + * This interface allows userspace to utilize the nesting IOMMU + * capabilities as reported in VFIO_IOMMU_TYPE1_INFO_CAP_NESTING + * cap through VFIO_IOMMU_GET_INFO. For platforms which require + * system wide PASID, PASID will be allocated by VFIO_IOMMU_PASID + * _REQUEST. + * + * @data[] types defined for each op: + * +=================+===============================================+ + * | NESTING OP | @data[] | + * +=================+===============================================+ + * | BIND_PGTBL | struct iommu_gpasid_bind_data | + * +-----------------+-----------------------------------------------+ + * | UNBIND_PGTBL | struct iommu_gpasid_bind_data | + * +-----------------+-----------------------------------------------+ + * | CACHE_INVLD | struct iommu_cache_invalidate_info | + * +-----------------+-----------------------------------------------+ + * | PAGE_RESP | struct iommu_page_response | + * +-----------------+-----------------------------------------------+ + * + * returns: 0 on success, -errno on failure. + */ +struct vfio_iommu_type1_nesting_op { + __u32 argsz; + __u32 flags; +#define VFIO_NESTING_OP_MASK (0xffff) /* lower 16-bits for op */ + __u8 data[]; +}; + +enum { + VFIO_IOMMU_NESTING_OP_BIND_PGTBL, + VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL, + VFIO_IOMMU_NESTING_OP_CACHE_INVLD, + VFIO_IOMMU_NESTING_OP_PAGE_RESP, +}; + +#define VFIO_IOMMU_NESTING_OP _IO(VFIO_TYPE, VFIO_BASE + 18) + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /*