diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index a611519528d78053fb794a651529ad82b2ed05af..0d96e414b5b9b4144c20579bd9ba2fe788ce41f0 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -397,9 +397,9 @@ config ARM_SMMU_V3 Say Y here if your system includes an IOMMU device implementing the ARM SMMUv3 architecture. +if ARM_SMMU_V3 config ARM_SMMU_V3_SVA bool "Shared Virtual Addressing support for the ARM SMMUv3" - depends on ARM_SMMU_V3 select IOMMU_SVA select IOMMU_IOPF select MMU_NOTIFIER @@ -435,6 +435,16 @@ config ARM_SMMU_V3_ECMDQ If not sure, say no. +config ARM_SMMU_V3_KUNIT_TEST + tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to unit-test arm-smmu-v3 driver functions. + + If unsure, say N. +endif + config S390_IOMMU def_bool y if S390 && PCI depends on S390 && PCI diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index 54feb1ecccad89924356438fb1bb2375a6957f15..014a997753a8a25d5ab3d2b512f48e66eab9b6fb 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -3,3 +3,5 @@ obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o arm_smmu_v3-objs-y += arm-smmu-v3.o arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o arm_smmu_v3-objs := $(arm_smmu_v3-objs-y) + +obj-$(CONFIG_ARM_SMMU_V3_KUNIT_TEST) += arm-smmu-v3-test.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index fd93039038a6b1765b31eb03fce4bc2cb24841d9..0334b0a8b5b981a3a53051f77f3e23ae1fcee0dd 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -12,214 +12,141 @@ #include "arm-smmu-v3.h" #include "../../io-pgtable-arm.h" -struct arm_smmu_mmu_notifier { - struct mmu_notifier mn; - struct arm_smmu_ctx_desc *cd; - bool cleared; - refcount_t refs; - struct list_head list; - struct arm_smmu_domain *domain; -}; - -#define mn_to_smmu(mn) container_of(mn, struct arm_smmu_mmu_notifier, mn) - -struct arm_smmu_bond { - struct mm_struct *mm; - struct arm_smmu_mmu_notifier *smmu_mn; - struct list_head list; -}; - -#define sva_to_bond(handle) \ - container_of(handle, struct arm_smmu_bond, sva) - static DEFINE_MUTEX(sva_lock); -/* - * Write the CD to the CD tables for all masters that this domain is attached - * to. Note that this is only used to update existing CD entries in the target - * CD table, for which it's assumed that arm_smmu_write_ctx_desc can't fail. - */ -static void arm_smmu_update_ctx_desc_devices(struct arm_smmu_domain *smmu_domain, - int ssid, - struct arm_smmu_ctx_desc *cd) +static int arm_smmu_realloc_s1_domain_asid(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain) { - struct arm_smmu_master *master; + struct arm_smmu_master_domain *master_domain; + struct arm_smmu_cd target_cd; unsigned long flags; - - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { - arm_smmu_write_ctx_desc(master, ssid, cd); - } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); -} - -/* - * Check if the CPU ASID is available on the SMMU side. If a private context - * descriptor is using it, try to replace it. 
- */ -static struct arm_smmu_ctx_desc * -arm_smmu_share_asid(struct mm_struct *mm, u16 asid) -{ int ret; - u32 new_asid; - struct arm_smmu_ctx_desc *cd; - struct arm_smmu_device *smmu; - struct arm_smmu_domain *smmu_domain; - - cd = xa_load(&arm_smmu_asid_xa, asid); - if (!cd) - return NULL; - if (cd->mm) { - if (WARN_ON(cd->mm != mm)) - return ERR_PTR(-EINVAL); - /* All devices bound to this mm use the same cd struct. */ - refcount_inc(&cd->refs); - return cd; - } + lockdep_assert_held(&smmu->asid_lock); - smmu_domain = container_of(cd, struct arm_smmu_domain, cd); - smmu = smmu_domain->smmu; - - ret = xa_alloc(&arm_smmu_asid_xa, &new_asid, cd, - XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); - if (ret) - return ERR_PTR(-ENOSPC); - /* - * Race with unmap: TLB invalidations will start targeting the new ASID, - * which isn't assigned yet. We'll do an invalidate-all on the old ASID - * later, so it doesn't matter. - */ - cd->asid = new_asid; /* - * Update ASID and invalidate CD in all associated masters. There will - * be some overlap between use of both ASIDs, until we invalidate the - * TLB. + * FIXME: The unmap and invalidation path doesn't take any locks but + * this is not fully safe. Since updating the CD tables is not atomic + * there is always a hole where invalidating only one ASID of two active + * ASIDs during unmap will cause the IOTLB to become stale. + * + * This approach is to hopefully shift the racing CPUs to the new ASID + * before we start programming the HW. This increases the chance that + * racing IOPTE changes will pick up an invalidation for the new ASID + * and we achieve eventual consistency. For the brief period where the + * old ASID is still in the CD entries it will become incoherent. */ - arm_smmu_update_ctx_desc_devices(smmu_domain, IOMMU_NO_PASID, cd); - - /* Invalidate TLB entries previously associated with that context */ - arm_smmu_tlb_inv_asid(smmu, asid); + ret = xa_alloc(&smmu->asid_map, &smmu_domain->asid, smmu_domain, + XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); + if (ret) + return ret; - xa_erase(&arm_smmu_asid_xa, asid); - return NULL; -} + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) { + struct arm_smmu_master *master = master_domain->master; + struct arm_smmu_cd *cdptr; -static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm) -{ - u16 asid; - int err = 0; - u64 tcr, par, reg; - struct arm_smmu_ctx_desc *cd; - struct arm_smmu_ctx_desc *ret = NULL; - - /* Don't free the mm until we release the ASID */ - mmgrab(mm); - - asid = arm64_mm_context_get(mm); - if (!asid) { - err = -ESRCH; - goto out_drop_mm; - } + cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid); + if (WARN_ON(!cdptr)) + continue; - cd = kzalloc(sizeof(*cd), GFP_KERNEL); - if (!cd) { - err = -ENOMEM; - goto out_put_context; + arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); + arm_smmu_write_cd_entry(master, master_domain->ssid, cdptr, + &target_cd); } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + return 0; +} - refcount_set(&cd->refs, 1); +static u64 page_size_to_cd(void) +{ + static_assert(PAGE_SIZE == SZ_4K || PAGE_SIZE == SZ_16K || + PAGE_SIZE == SZ_64K); + if (PAGE_SIZE == SZ_64K) + return ARM_LPAE_TCR_TG0_64K; + if (PAGE_SIZE == SZ_16K) + return ARM_LPAE_TCR_TG0_16K; + return ARM_LPAE_TCR_TG0_4K; +} - mutex_lock(&arm_smmu_asid_lock); - ret = arm_smmu_share_asid(mm, asid); - if (ret) { - mutex_unlock(&arm_smmu_asid_lock); - goto 
out_free_cd; - } +static void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, + struct arm_smmu_master *master, + struct mm_struct *mm, u16 asid, + bool btm_invalidation) +{ + u64 par; + + memset(target, 0, sizeof(*target)); + + par = cpuid_feature_extract_unsigned_field( + read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1), + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + + target->data[0] = cpu_to_le64( + CTXDESC_CD_0_TCR_EPD1 | +#ifdef __BIG_ENDIAN + CTXDESC_CD_0_ENDI | +#endif + CTXDESC_CD_0_V | + FIELD_PREP(CTXDESC_CD_0_TCR_IPS, par) | + CTXDESC_CD_0_AA64 | + (master->stall_enabled ? CTXDESC_CD_0_S : 0) | + CTXDESC_CD_0_R | + CTXDESC_CD_0_A | + (btm_invalidation ? 0 : CTXDESC_CD_0_ASET) | + FIELD_PREP(CTXDESC_CD_0_ASID, asid)); + + if (master->smmu->features & ARM_SMMU_FEAT_HD) + target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_HD); + if (master->smmu->features & ARM_SMMU_FEAT_HA) + target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_HA); - err = xa_insert(&arm_smmu_asid_xa, asid, cd, GFP_KERNEL); - mutex_unlock(&arm_smmu_asid_lock); - - if (err) - goto out_free_asid; - - /* HA and HD will be filtered out later if not supported by the SMMU */ - tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, 64ULL - vabits_actual) | - FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, ARM_LPAE_TCR_RGN_WBWA) | - FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, ARM_LPAE_TCR_RGN_WBWA) | - FIELD_PREP(CTXDESC_CD_0_TCR_SH0, ARM_LPAE_TCR_SH_IS) | - CTXDESC_CD_0_TCR_HA | CTXDESC_CD_0_TCR_HD | - CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64; - - switch (PAGE_SIZE) { - case SZ_4K: - tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_TG0, ARM_LPAE_TCR_TG0_4K); - break; - case SZ_16K: - tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_TG0, ARM_LPAE_TCR_TG0_16K); - break; - case SZ_64K: - tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_TG0, ARM_LPAE_TCR_TG0_64K); - break; - default: - WARN_ON(1); - err = -EINVAL; - goto out_free_asid; + /* + * If no MM is passed then this creates a SVA entry that faults + * everything. arm_smmu_write_cd_entry() can hitlessly go between these + * two entries types since TTB0 is ignored by HW when EPD0 is set. + */ + if (mm) { + target->data[0] |= cpu_to_le64( + FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, + 64ULL - vabits_actual) | + FIELD_PREP(CTXDESC_CD_0_TCR_TG0, page_size_to_cd()) | + FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, + ARM_LPAE_TCR_RGN_WBWA) | + FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, + ARM_LPAE_TCR_RGN_WBWA) | + FIELD_PREP(CTXDESC_CD_0_TCR_SH0, ARM_LPAE_TCR_SH_IS)); + + target->data[1] = cpu_to_le64(virt_to_phys(mm->pgd) & + CTXDESC_CD_1_TTB0_MASK); + } else { + target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_EPD0); + + /* + * Disable stall and immediately generate an abort if stall + * disable is permitted. This speeds up cleanup for an unclean + * exit if the device is still doing a lot of DMA. + */ + if (master->stall_enabled && + !(master->smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) + target->data[0] &= + cpu_to_le64(~(CTXDESC_CD_0_S | CTXDESC_CD_0_R)); } - reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - par = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_IPS, par); - - cd->ttbr = virt_to_phys(mm->pgd); - cd->tcr = tcr; /* * MAIR value is pretty much constant and global, so we can just get it * from the current CPU register */ - cd->mair = read_sysreg(mair_el1); - cd->asid = asid; - cd->mm = mm; - - return cd; - -out_free_asid: - arm_smmu_free_asid(cd); -out_free_cd: - kfree(cd); -out_put_context: - arm64_mm_context_put(mm); -out_drop_mm: - mmdrop(mm); - return err < 0 ? 
ERR_PTR(err) : ret; + target->data[3] = cpu_to_le64(read_sysreg(mair_el1)); } -static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd) -{ - if (arm_smmu_free_asid(cd)) { - /* Unpin ASID */ - arm64_mm_context_put(cd->mm); - mmdrop(cd->mm); - kfree(cd); - } -} - -/* - * Cloned from the MAX_TLBI_OPS in arch/arm64/include/asm/tlbflush.h, this - * is used as a threshold to replace per-page TLBI commands to issue in the - * command queue with an address-space TLBI command, when SMMU w/o a range - * invalidation feature handles too many per-page TLBI commands, which will - * otherwise result in a soft lockup. - */ -#define CMDQ_MAX_TLBI_OPS (1 << (PAGE_SHIFT - 3)) - static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { - struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); - struct arm_smmu_domain *smmu_domain = smmu_mn->domain; + struct arm_smmu_domain *smmu_domain = + container_of(mn, struct arm_smmu_domain, mmu_notifier); size_t size; /* @@ -228,57 +155,56 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, * range. So do a simple translation here by calculating size correctly. */ size = end - start; - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_RANGE_INV)) { - if (size >= CMDQ_MAX_TLBI_OPS * PAGE_SIZE) - size = 0; - } else { - if (size == ULONG_MAX) - size = 0; - } - - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) { - if (!size) - arm_smmu_tlb_inv_asid(smmu_domain->smmu, - smmu_mn->cd->asid); - else - arm_smmu_tlb_inv_range_asid(start, size, - smmu_mn->cd->asid, - PAGE_SIZE, false, - smmu_domain); - } + if (size == ULONG_MAX) + size = 0; - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), start, - size); + if (!smmu_domain->btm_invalidation) + arm_smmu_tlb_inv_range_s1(smmu_domain, start, size, PAGE_SIZE, + false); + arm_smmu_atc_inv_domain(smmu_domain, start, size); } static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); - struct arm_smmu_domain *smmu_domain = smmu_mn->domain; - - mutex_lock(&sva_lock); - if (smmu_mn->cleared) { - mutex_unlock(&sva_lock); - return; - } + struct arm_smmu_domain *smmu_domain = + container_of(mn, struct arm_smmu_domain, mmu_notifier); + struct arm_smmu_master_domain *master_domain; + unsigned long flags; /* * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events, * but disable translation. 
*/ - arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), - &quiet_cd); - - arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + struct arm_smmu_master *master = master_domain->master; + struct arm_smmu_cd target; + struct arm_smmu_cd *cdptr; + + cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid); + if (WARN_ON(!cdptr)) + continue; + + /* An SVA ASID never changes, no asid_lock required */ + arm_smmu_make_sva_cd(&target, master, NULL, + smmu_domain->asid, + smmu_domain->btm_invalidation); + arm_smmu_write_cd_entry(master, master_domain->ssid, cdptr, + &target); + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - smmu_mn->cleared = true; - mutex_unlock(&sva_lock); + arm_smmu_tlb_inv_all_s1(smmu_domain); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0); } static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn) { - kfree(mn_to_smmu(mn)); + struct arm_smmu_domain *smmu_domain = + container_of(mn, struct arm_smmu_domain, mmu_notifier); + + kfree(smmu_domain); } static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { @@ -287,115 +213,6 @@ static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { .free_notifier = arm_smmu_mmu_notifier_free, }; -/* Allocate or get existing MMU notifier for this {domain, mm} pair */ -static struct arm_smmu_mmu_notifier * -arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, - struct mm_struct *mm) -{ - int ret; - struct arm_smmu_ctx_desc *cd; - struct arm_smmu_mmu_notifier *smmu_mn; - - list_for_each_entry(smmu_mn, &smmu_domain->mmu_notifiers, list) { - if (smmu_mn->mn.mm == mm) { - refcount_inc(&smmu_mn->refs); - return smmu_mn; - } - } - - cd = arm_smmu_alloc_shared_cd(mm); - if (IS_ERR(cd)) - return ERR_CAST(cd); - - smmu_mn = kzalloc(sizeof(*smmu_mn), GFP_KERNEL); - if (!smmu_mn) { - ret = -ENOMEM; - goto err_free_cd; - } - - refcount_set(&smmu_mn->refs, 1); - smmu_mn->cd = cd; - smmu_mn->domain = smmu_domain; - smmu_mn->mn.ops = &arm_smmu_mmu_notifier_ops; - - ret = mmu_notifier_register(&smmu_mn->mn, mm); - if (ret) { - kfree(smmu_mn); - goto err_free_cd; - } - - list_add(&smmu_mn->list, &smmu_domain->mmu_notifiers); - return smmu_mn; - -err_free_cd: - arm_smmu_free_shared_cd(cd); - return ERR_PTR(ret); -} - -static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) -{ - struct mm_struct *mm = smmu_mn->mn.mm; - struct arm_smmu_ctx_desc *cd = smmu_mn->cd; - struct arm_smmu_domain *smmu_domain = smmu_mn->domain; - - if (!refcount_dec_and_test(&smmu_mn->refs)) - return; - - list_del(&smmu_mn->list); - - /* - * If we went through clear(), we've already invalidated, and no - * new TLB entry can have been formed. 
- */ - if (!smmu_mn->cleared) { - arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, - 0); - } - - /* Frees smmu_mn */ - mmu_notifier_put(&smmu_mn->mn); - arm_smmu_free_shared_cd(cd); -} - -static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, - struct mm_struct *mm) -{ - int ret; - struct arm_smmu_bond *bond; - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - - if (!master || !master->sva_enabled) - return -ENODEV; - - bond = kzalloc(sizeof(*bond), GFP_KERNEL); - if (!bond) - return -ENOMEM; - - bond->mm = mm; - - bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm); - if (IS_ERR(bond->smmu_mn)) { - ret = PTR_ERR(bond->smmu_mn); - goto err_free_bond; - } - - ret = arm_smmu_write_ctx_desc(master, pasid, bond->smmu_mn->cd); - if (ret) - goto err_put_notifier; - - list_add(&bond->list, &master->bonds); - return 0; - -err_put_notifier: - arm_smmu_mmu_notifier_put(bond->smmu_mn); -err_free_bond: - kfree(bond); - return ret; -} - bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { unsigned long reg, fld; @@ -512,11 +329,6 @@ int arm_smmu_master_enable_sva(struct arm_smmu_master *master) int arm_smmu_master_disable_sva(struct arm_smmu_master *master) { mutex_lock(&sva_lock); - if (!list_empty(&master->bonds)) { - dev_err(master->dev, "cannot disable SVA, device is bound\n"); - mutex_unlock(&sva_lock); - return -EBUSY; - } arm_smmu_master_sva_disable_iopf(master); master->sva_enabled = false; mutex_unlock(&sva_lock); @@ -533,48 +345,51 @@ void arm_smmu_sva_notifier_synchronize(void) mmu_notifier_synchronize(); } -void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) +static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id) { - struct mm_struct *mm = domain->mm; - struct arm_smmu_bond *bond = NULL, *t; + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_cd target; + int ret; - mutex_lock(&sva_lock); - - arm_smmu_write_ctx_desc(master, id, NULL); - - list_for_each_entry(t, &master->bonds, list) { - if (t->mm == mm) { - bond = t; - break; - } - } - - if (!WARN_ON(!bond)) { - list_del(&bond->list); - arm_smmu_mmu_notifier_put(bond->smmu_mn); - kfree(bond); - } - mutex_unlock(&sva_lock); -} + /* Prevent arm_smmu_mm_release from being called while we are attaching */ + if (!mmget_not_zero(domain->mm)) + return -EINVAL; -static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) -{ - int ret = 0; - struct mm_struct *mm = domain->mm; + /* + * This does not need the arm_smmu_asid_lock because SVA domains never + * get reassigned + */ + arm_smmu_make_sva_cd(&target, master, smmu_domain->domain.mm, + smmu_domain->asid, + smmu_domain->btm_invalidation); - mutex_lock(&sva_lock); - ret = __arm_smmu_sva_bind(dev, id, mm); - mutex_unlock(&sva_lock); + ret = arm_smmu_set_pasid(master, smmu_domain, id, &target); + mmput(domain->mm); return ret; } static void arm_smmu_sva_domain_free(struct iommu_domain *domain) { - kfree(domain); + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + + /* + * Notice that the arm_smmu_mm_arch_invalidate_secondary_tlbs op can + * still be called/running at this point. 
We allow the ASID to be + * reused, and if there is a race then it just suffers harmless + * unnecessary invalidation. + */ + arm_smmu_domain_free_id(smmu_domain); + if (smmu_domain->btm_invalidation) + arm64_mm_context_put(domain->mm); + + /* + * Actual free is defered to the SRCU callback + * arm_smmu_mmu_notifier_free() + */ + mmu_notifier_put(&smmu_domain->mmu_notifier); } static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { @@ -582,14 +397,123 @@ static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { .free = arm_smmu_sva_domain_free }; -struct iommu_domain *arm_smmu_sva_domain_alloc(void) +static int arm_smmu_share_asid(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain, + struct mm_struct *mm) { - struct iommu_domain *domain; + struct arm_smmu_domain *old_s1_domain; + int ret; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - domain->ops = &arm_smmu_sva_domain_ops; + /* + * Notice that BTM is never currently enabled, this is all dead code. + * The specification cautions: + * + * Note: Arm expects that SMMU stage 2 address spaces are generally + * shared with their respective PE virtual machine stage 2 + * configuration. If broadcast invalidation is required to be avoided + * for a particular SMMU stage 2 address space, Arm recommends that a + * hypervisor configures the STE with a VMID that is not allocated for + * virtual machine use on the PEs + * + * However, in Linux, both KVM and SMMU think they own the VMID pool. + * Unfortunately the ARM design is problematic for Linux as we do not + * currently share the S2 table with KVM. This creates a situation where + * the S2 needs to have the same VMID as KVM in order to allow the guest + * to use BTM, however we must still invalidate the S2 directly since it + * is a different radix tree. What Linux would like is something like + * ASET for the STE to disable BTM only for the S2. + * + * Arguably in a system with BTM the driver should prefer to use a S1 + * table in all cases execpt when explicitly asked to create a nesting + * parent. Then it should use the VMID of KVM to enable BTM in the + * guest. We cannot optimize away the resulting double invalidation of + * the S2 :( Or we simply ignore BTM entirely as we are doing now. + */ + if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) + return arm_smmu_domain_alloc_id(smmu, smmu_domain); + + /* At this point the caller ensures we have a mmget() */ + smmu_domain->asid = arm64_mm_context_get(mm); + + mutex_lock(&smmu->asid_lock); + old_s1_domain = xa_store(&smmu->asid_map, smmu_domain->asid, + smmu_domain, GFP_KERNEL); + if (xa_err(old_s1_domain)) { + ret = xa_err(old_s1_domain); + goto out_put_asid; + } + + /* + * In BTM mode the CPU ASID and the IOMMU ASID have to be the same. + * Unfortunately we run separate allocators for this and the IOMMU + * ASID can already have been assigned to a S1 domain. SVA domains + * always align to their CPU ASIDs. In this case we change + * the S1 domain's ASID, update the CD entry and flush the caches. + * + * This is a bit tricky, all the places writing to a S1 CD, reading the + * S1 ASID, or doing xa_erase must hold the asid_lock or xa_lock to + * avoid IOTLB incoherence. 
+ */ + if (old_s1_domain) { + if (WARN_ON(old_s1_domain->domain.type == IOMMU_DOMAIN_SVA)) { + ret = -EINVAL; + goto out_restore_s1; + } + ret = arm_smmu_realloc_s1_domain_asid(smmu, old_s1_domain); + if (ret) + goto out_restore_s1; - return domain; + /* Clean the ASID since it was just recovered before using it */ + arm_smmu_tlb_inv_all_s1(smmu_domain); + } + + smmu_domain->btm_invalidation = true; + + ret = 0; + goto out_unlock; + +out_restore_s1: + xa_store(&smmu->asid_map, smmu_domain->asid, old_s1_domain, + GFP_KERNEL); +out_put_asid: + arm64_mm_context_put(mm); +out_unlock: + mutex_unlock(&smmu->asid_lock); + return ret; +} + +struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_domain *smmu_domain; + int ret; + + smmu_domain = arm_smmu_domain_alloc(); + if (IS_ERR(smmu_domain)) + return ERR_CAST(smmu_domain); + + smmu_domain->domain.type = IOMMU_DOMAIN_SVA; + smmu_domain->domain.ops = &arm_smmu_sva_domain_ops; + smmu_domain->smmu = smmu; + + ret = arm_smmu_share_asid(smmu, smmu_domain, mm); + if (ret) + goto err_free; + + smmu_domain->mmu_notifier.ops = &arm_smmu_mmu_notifier_ops; + ret = mmu_notifier_register(&smmu_domain->mmu_notifier, mm); + if (ret) + goto err_asid; + + return &smmu_domain->domain; + +err_asid: + arm_smmu_domain_free_id(smmu_domain); + if (smmu_domain->btm_invalidation) + arm64_mm_context_put(mm); +err_free: + kfree(smmu_domain); + return ERR_PTR(ret); } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c new file mode 100644 index 0000000000000000000000000000000000000000..dcca393a8450df845eade57a89debc6faccf1342 --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2024 Google LLC. 
+ */ +#include +#include + +#include "arm-smmu-v3.h" + +struct arm_smmu_test_writer { + struct arm_smmu_entry_writer writer; + struct kunit *test; + const __le64 *init_entry; + const __le64 *target_entry; + __le64 *entry; + + bool invalid_entry_written; + unsigned int num_syncs; +}; + +static struct arm_smmu_ste bypass_ste; +static struct arm_smmu_ste abort_ste; + +static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry, + const __le64 *used_bits, + const __le64 *target, + unsigned int length) +{ + bool differs = false; + unsigned int i; + + for (i = 0; i < length; i++) { + if ((entry[i] & used_bits[i]) != target[i]) + differs = true; + } + return differs; +} + +static void +arm_smmu_test_writer_record_syncs(struct arm_smmu_entry_writer *writer) +{ + struct arm_smmu_test_writer *test_writer = + container_of(writer, struct arm_smmu_test_writer, writer); + __le64 *entry_used_bits; + unsigned int num_entry_qwords = writer->ops->num_entry_qwords; + + entry_used_bits = kunit_kzalloc( + test_writer->test, sizeof(*entry_used_bits) * num_entry_qwords, + GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test_writer->test, entry_used_bits); + + pr_debug("STE value is now set to: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, + test_writer->entry, + num_entry_qwords * sizeof(*test_writer->entry), + false); + + test_writer->num_syncs += 1; + if (!(test_writer->entry[0] & writer->ops->v_bit)) { + test_writer->invalid_entry_written = true; + } else { + /* + * At any stage in a hitless transition, the entry must be + * equivalent to either the initial entry or the target entry + * when only considering the bits used by the current + * configuration. + */ + writer->ops->get_used(test_writer->entry, entry_used_bits); + KUNIT_EXPECT_FALSE( + test_writer->test, + arm_smmu_entry_differs_in_used_bits( + test_writer->entry, entry_used_bits, + test_writer->init_entry, num_entry_qwords) && + arm_smmu_entry_differs_in_used_bits( + test_writer->entry, entry_used_bits, + test_writer->target_entry, + num_entry_qwords)); + } +} + +static void +arm_smmu_v3_test_ste_debug_print_used_bits(struct arm_smmu_entry_writer *writer, + const struct arm_smmu_ste *ste) +{ + struct arm_smmu_ste used_bits = {}; + + arm_smmu_get_ste_used(ste->data, used_bits.data); + pr_debug("STE used bits: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, used_bits.data, + sizeof(used_bits), false); +} + +static const struct arm_smmu_entry_writer_ops test_ops = { + .v_bit = cpu_to_le64(STRTAB_STE_0_V), + .num_entry_qwords = sizeof(struct arm_smmu_ste) / sizeof(u64), + .sync = arm_smmu_test_writer_record_syncs, + .get_used = arm_smmu_get_ste_used, +}; + +static void arm_smmu_v3_test_ste_expect_transition( + struct kunit *test, const struct arm_smmu_ste *cur, + const struct arm_smmu_ste *target, int num_syncs_expected, bool hitless) +{ + struct arm_smmu_ste cur_copy = *cur; + struct arm_smmu_test_writer test_writer = { + .writer = { + .ops = &test_ops, + }, + .test = test, + .init_entry = cur->data, + .target_entry = target->data, + .entry = cur_copy.data, + .num_syncs = 0, + .invalid_entry_written = false, + + }; + + pr_debug("STE initial value: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, cur_copy.data, + sizeof(cur_copy), false); + arm_smmu_v3_test_ste_debug_print_used_bits(&test_writer.writer, cur); + pr_debug("STE target value: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, target->data, + sizeof(cur_copy), false); + arm_smmu_v3_test_ste_debug_print_used_bits(&test_writer.writer, target); + + 
arm_smmu_write_entry(&test_writer.writer, cur_copy.data, target->data);
+
+	KUNIT_EXPECT_EQ(test, test_writer.invalid_entry_written, !hitless);
+	KUNIT_EXPECT_EQ(test, test_writer.num_syncs, num_syncs_expected);
+	KUNIT_EXPECT_MEMEQ(test, target->data, cur_copy.data, sizeof(cur_copy));
+}
+
+static void arm_smmu_v3_test_ste_expect_non_hitless_transition(
+	struct kunit *test, const struct arm_smmu_ste *cur,
+	const struct arm_smmu_ste *target, int num_syncs_expected)
+{
+	arm_smmu_v3_test_ste_expect_transition(test, cur, target,
+					       num_syncs_expected, false);
+}
+
+static void arm_smmu_v3_test_ste_expect_hitless_transition(
+	struct kunit *test, const struct arm_smmu_ste *cur,
+	const struct arm_smmu_ste *target, int num_syncs_expected)
+{
+	arm_smmu_v3_test_ste_expect_transition(test, cur, target,
+					       num_syncs_expected, true);
+}
+
+static const dma_addr_t fake_cdtab_dma_addr = 0xF0F0F0F0F0F0;
+
+static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste,
+					   unsigned int s1dss,
+					   const dma_addr_t dma_addr)
+{
+	struct arm_smmu_device smmu = { .features = ARM_SMMU_FEAT_STALLS };
+	struct arm_smmu_master master = {
+		.cd_table.cdtab_dma = dma_addr,
+		.cd_table.s1cdmax = 0xFF,
+		.cd_table.s1fmt = STRTAB_STE_0_S1FMT_64K_L2,
+		.smmu = &smmu,
+	};
+
+	arm_smmu_make_cdtable_ste(ste, &master, true, s1dss);
+}
+
+static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test)
+{
+	/*
+	 * Bypass STEs have used bits in the first two qwords, while abort STEs
+	 * only have used bits in the first qword. Transitioning from bypass to
+	 * abort requires two syncs: the first to set the first qword and make
+	 * the STE into an abort, the second to clean up the second qword.
+	 */
+	arm_smmu_v3_test_ste_expect_hitless_transition(
+		test, &bypass_ste, &abort_ste,
+		/*num_syncs_expected=*/2);
+}
+
+static void arm_smmu_v3_write_ste_test_abort_to_bypass(struct kunit *test)
+{
+	/*
+	 * Transitioning from abort to bypass also requires two syncs: the first
+	 * to set the second qword data required by the bypass STE, and the
+	 * second to set the first qword and switch to bypass.
+ */ + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &abort_ste, &bypass_ste, + /*num_syncs_expected=*/2); +} + +static void arm_smmu_v3_write_ste_test_cdtable_to_abort(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &ste, &abort_ste, + /*num_syncs_expected=*/2); +} + +static void arm_smmu_v3_write_ste_test_abort_to_cdtable(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &abort_ste, &ste, + /*num_syncs_expected=*/2); +} + +static void arm_smmu_v3_write_ste_test_cdtable_to_bypass(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &ste, &bypass_ste, + /*num_syncs_expected=*/3); +} + +static void arm_smmu_v3_write_ste_test_bypass_to_cdtable(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &bypass_ste, &ste, + /*num_syncs_expected=*/3); +} + +static void arm_smmu_v3_write_ste_test_cdtable_s1dss_change(struct kunit *test) +{ + struct arm_smmu_ste ste; + struct arm_smmu_ste s1dss_bypass; + + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS, + fake_cdtab_dma_addr); + + /* + * Flipping s1dss on a CD table STE only involves changes to the second + * qword of an STE and can be done in a single write. 
+ */ + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &ste, &s1dss_bypass, + /*num_syncs_expected=*/1); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &s1dss_bypass, &ste, + /*num_syncs_expected=*/1); +} + +static void +arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass(struct kunit *test) +{ + struct arm_smmu_ste s1dss_bypass; + + arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS, + fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &s1dss_bypass, &bypass_ste, + /*num_syncs_expected=*/2); +} + +static void +arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass(struct kunit *test) +{ + struct arm_smmu_ste s1dss_bypass; + + arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS, + fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &bypass_ste, &s1dss_bypass, + /*num_syncs_expected=*/2); +} + +static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste, + bool ats_enabled) +{ + struct arm_smmu_device smmu = { .features = ARM_SMMU_FEAT_STALLS }; + struct arm_smmu_master master = { + .smmu = &smmu, + }; + struct io_pgtable io_pgtable = {}; + struct arm_smmu_domain smmu_domain = { + .pgtbl_ops = &io_pgtable.ops, + }; + + io_pgtable.cfg.arm_lpae_s2_cfg.vttbr = 0xdaedbeefdeadbeefULL; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.ps = 1; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.tg = 2; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.sh = 3; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.orgn = 1; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.irgn = 2; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.sl = 3; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.tsz = 4; + + arm_smmu_make_s2_domain_ste(ste, &master, &smmu_domain, ats_enabled); +} + +static void arm_smmu_v3_write_ste_test_s2_to_abort(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &ste, &abort_ste, + /*num_syncs_expected=*/2); +} + +static void arm_smmu_v3_write_ste_test_abort_to_s2(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &abort_ste, &ste, + /*num_syncs_expected=*/2); +} + +static void arm_smmu_v3_write_ste_test_s2_to_bypass(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &ste, &bypass_ste, + /*num_syncs_expected=*/2); +} + +static void arm_smmu_v3_write_ste_test_bypass_to_s2(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &bypass_ste, &ste, + /*num_syncs_expected=*/2); +} + +static void arm_smmu_v3_write_ste_test_s1_to_s2(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_test_make_s2_ste(&s2_ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &s1_ste, &s2_ste, + /*num_syncs_expected=*/3); +} + +static void arm_smmu_v3_write_ste_test_s2_to_s1(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_test_make_s2_ste(&s2_ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &s2_ste, &s1_ste, + /*num_syncs_expected=*/3); +} + +static void 
arm_smmu_v3_write_ste_test_non_hitless(struct kunit *test) +{ + struct arm_smmu_ste ste; + struct arm_smmu_ste ste_2; + + /* + * Although no flow resembles this in practice, one way to force an STE + * update to be non-hitless is to change its CD table pointer as well as + * s1 dss field in the same update. + */ + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste_2, STRTAB_STE_1_S1DSS_BYPASS, + 0x4B4B4b4B4B); + arm_smmu_v3_test_ste_expect_non_hitless_transition( + test, &ste, &ste_2, + /*num_syncs_expected=*/3); +} + +static struct kunit_case arm_smmu_v3_test_cases[] = { + KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_abort), + KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_bypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_to_abort), + KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_cdtable), + KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_to_bypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_cdtable), + KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_s1dss_change), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_abort), + KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_s2), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_bypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_s2), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1), + KUNIT_CASE(arm_smmu_v3_write_ste_test_non_hitless), + {}, +}; + +static int arm_smmu_v3_test_suite_init(struct kunit_suite *test) +{ + arm_smmu_make_bypass_ste(&bypass_ste); + arm_smmu_make_abort_ste(&abort_ste); + return 0; +} + +static struct kunit_suite arm_smmu_v3_test_module = { + .name = "arm-smmu-v3-kunit-test", + .suite_init = arm_smmu_v3_test_suite_init, + .test_cases = arm_smmu_v3_test_cases, +}; +kunit_test_suites(&arm_smmu_v3_test_module); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 3d3e0c9350232b4796be0f2453659575bd725736..2bbf5304198b5f88525a1ccc4900b2cf88956349 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "arm-smmu-v3.h" #include "../../dma-iommu.h" @@ -99,6 +100,8 @@ static int arm_smmu_bypass_dev_domain_type(struct device *dev) } #endif +static struct iommu_ops arm_smmu_ops; + enum arm_smmu_msi_index { EVTQ_MSI_INDEX, GERROR_MSI_INDEX, @@ -106,6 +109,10 @@ enum arm_smmu_msi_index { ARM_SMMU_MAX_MSIS, }; +#define NUM_ENTRY_QWORDS \ + (max(sizeof(struct arm_smmu_ste), sizeof(struct arm_smmu_cd)) / \ + sizeof(u64)) + static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = { [EVTQ_MSI_INDEX] = { ARM_SMMU_EVTQ_IRQ_CFG0, @@ -129,21 +136,17 @@ struct arm_smmu_option_prop { const char *prop; }; -DEFINE_XARRAY_ALLOC1(arm_smmu_asid_xa); -DEFINE_MUTEX(arm_smmu_asid_lock); - -/* - * Special value used by SVA when a process dies, to quiesce a CD without - * disabling it. 
- */ -struct arm_smmu_ctx_desc quiet_cd = { 0 }; - static struct arm_smmu_option_prop arm_smmu_options[] = { { ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" }, { ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"}, { 0, NULL}, }; +static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_device *smmu); +static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); +static void arm_smmu_tlb_inv_all_s2(struct arm_smmu_domain *smmu_domain); + static void parse_driver_options(struct arm_smmu_device *smmu) { int i = 0; @@ -159,17 +162,22 @@ static void parse_driver_options(struct arm_smmu_device *smmu) } /* Low-level queue manipulation functions */ -static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n) +static int queue_ncmds(struct arm_smmu_ll_queue *q) { - u32 space, prod, cons; + u32 prod, cons; prod = Q_IDX(q, q->prod); cons = Q_IDX(q, q->cons); if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons)) - space = (1 << q->max_n_shift) - (prod - cons); + return prod - cons; else - space = cons - prod; + return (1 << q->max_n_shift) - cons + prod; +} + +static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n) +{ + u32 space = (1 << q->max_n_shift) - queue_ncmds(q); return space >= n; } @@ -347,8 +355,17 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31); break; case CMDQ_OP_TLBI_NH_VA: - cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); + cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); fallthrough; + case CMDQ_OP_TLBI_NH_VAA: + cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); + cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf); + cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl); + cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg); + cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK; + break; case CMDQ_OP_TLBI_EL2_VA: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num); cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale); @@ -370,6 +387,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) case CMDQ_OP_TLBI_NH_ASID: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); fallthrough; + case CMDQ_OP_TLBI_NH_ALL: case CMDQ_OP_TLBI_S12_VMALL: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); break; @@ -1171,15 +1189,190 @@ static void arm_smmu_page_response(struct device *dev, struct iopf_fault *unused } /* Context descriptor manipulation functions */ -void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) + +/* + * Based on the value of ent report which bits of the STE the HW will access. It + * would be nice if this was complete according to the spec, but minimally it + * has to capture the bits this driver uses. + */ +void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) { - struct arm_smmu_cmdq_ent cmd = { - .opcode = smmu->features & ARM_SMMU_FEAT_E2H ? 
- CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID, - .tlbi.asid = asid, - }; + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])); - arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); + used_bits[0] = cpu_to_le64(STRTAB_STE_0_V); + if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) + return; + + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG); + + /* S1 translates */ + if (cfg & BIT(0)) { + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT | + STRTAB_STE_0_S1CTXPTR_MASK | + STRTAB_STE_0_S1CDMAX); + used_bits[1] |= + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW | + STRTAB_STE_1_EATS); + used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); + + /* + * See 13.5 Summary of attribute/permission configuration fields + * for the SHCFG behavior. + */ + if (FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == + STRTAB_STE_1_S1DSS_BYPASS) + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); + } + + /* S2 translates */ + if (cfg & BIT(1)) { + used_bits[1] |= + cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG); + used_bits[2] |= + cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | + STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | + STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); + used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK); + } + + if (cfg == STRTAB_STE_0_CFG_BYPASS) + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); +} + +/* + * Figure out if we can do a hitless update of entry to become target. Returns a + * bit mask where 1 indicates that qword needs to be set disruptively. + * unused_update is an intermediate value of entry that has unused bits set to + * their new values. + */ +static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer, + const __le64 *entry, const __le64 *target, + __le64 *unused_update) +{ + __le64 target_used[NUM_ENTRY_QWORDS] = {}; + __le64 cur_used[NUM_ENTRY_QWORDS] = {}; + u8 used_qword_diff = 0; + unsigned int i; + + writer->ops->get_used(entry, cur_used); + writer->ops->get_used(target, target_used); + + for (i = 0; i != writer->ops->num_entry_qwords; i++) { + /* + * Check that masks are up to date, the make functions are not + * allowed to set a bit to 1 if the used function doesn't say it + * is used. + */ + WARN_ON_ONCE(target[i] & ~target_used[i]); + + /* Bits can change because they are not currently being used */ + unused_update[i] = (entry[i] & cur_used[i]) | + (target[i] & ~cur_used[i]); + /* + * Each bit indicates that a used bit in a qword needs to be + * changed after unused_update is applied. + */ + if ((unused_update[i] & target_used[i]) != target[i]) + used_qword_diff |= 1 << i; + } + return used_qword_diff; +} + +static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry, + const __le64 *target, unsigned int start, + unsigned int len) +{ + bool changed = false; + unsigned int i; + + for (i = start; len != 0; len--, i++) { + if (entry[i] != target[i]) { + WRITE_ONCE(entry[i], target[i]); + changed = true; + } + } + + if (changed) + writer->ops->sync(writer); + return changed; +} + +/* + * Update the STE/CD to the target configuration. The transition from the + * current entry to the target entry takes place over multiple steps that + * attempts to make the transition hitless if possible. This function takes care + * not to create a situation where the HW can perceive a corrupted entry. HW is + * only required to have a 64 bit atomicity with stores from the CPU, while + * entries are many 64 bit values big. 
+ *
+ * The difference between the current value and the target value is analyzed to
+ * determine which of three updates are required - disruptive, hitless or no
+ * change.
+ *
+ * In the most general disruptive case we can make any update in three steps:
+ *  - Disrupting the entry (V=0)
+ *  - Fill now unused qwords, except qword 0 which contains V
+ *  - Make qword 0 have the final value and valid (V=1) with a single 64
+ *    bit store
+ *
+ * However this disrupts the HW while it is happening. There are several
+ * interesting cases where a STE/CD can be updated without disturbing the HW
+ * because only a small number of bits are changing (S1DSS, CONFIG, etc) or
+ * because the used bits don't intersect. We can detect this by calculating how
+ * many 64 bit values need update after adjusting the unused bits and skip the
+ * V=0 process. This relies on the IGNORED behavior described in the
+ * specification.
+ */
+void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
+			  const __le64 *target)
+{
+	unsigned int num_entry_qwords = writer->ops->num_entry_qwords;
+	__le64 unused_update[NUM_ENTRY_QWORDS];
+	u8 used_qword_diff;
+
+	used_qword_diff =
+		arm_smmu_entry_qword_diff(writer, entry, target, unused_update);
+	if (hweight8(used_qword_diff) == 1) {
+		/*
+		 * Only one qword needs its used bits to be changed. This is a
+		 * hitless update, update all bits the current STE is ignoring
+		 * to their new values, then update a single "critical qword" to
+		 * change the STE and finally 0 out any bits that are now unused
+		 * in the target configuration.
+		 */
+		unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
+
+		/*
+		 * Skip writing unused bits in the critical qword since we'll be
+		 * writing it in the next step anyways. This can save a sync
+		 * when the only change is in that qword.
+		 */
+		unused_update[critical_qword_index] =
+			entry[critical_qword_index];
+		entry_set(writer, entry, unused_update, 0, num_entry_qwords);
+		entry_set(writer, entry, target, critical_qword_index, 1);
+		entry_set(writer, entry, target, 0, num_entry_qwords);
+	} else if (used_qword_diff) {
+		/*
+		 * At least two qwords need their in-use bits to be changed. This
+		 * requires a breaking update, zero the V bit, write all qwords
+		 * but 0, then set qword 0.
+		 */
+		unused_update[0] = entry[0] & (~writer->ops->v_bit);
+		entry_set(writer, entry, unused_update, 0, 1);
+		entry_set(writer, entry, target, 1, num_entry_qwords - 1);
+		entry_set(writer, entry, target, 0, 1);
+	} else {
+		/*
+		 * No in-use bit changed. Sanity check that all unused bits are 0
+		 * in the entry. The target was already sanity checked by
+		 * arm_smmu_entry_qword_diff().
+ */ + WARN_ON_ONCE( + entry_set(writer, entry, target, 0, num_entry_qwords)); + } } static void arm_smmu_sync_cd(struct arm_smmu_master *master, @@ -1211,15 +1404,12 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master, static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu, struct arm_smmu_l1_ctx_desc *l1_desc) { - size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); + size_t size = CTXDESC_L2_ENTRIES * sizeof(struct arm_smmu_cd); - l1_desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, - &l1_desc->l2ptr_dma, GFP_KERNEL); - if (!l1_desc->l2ptr) { - dev_warn(smmu->dev, - "failed to allocate context descriptor table\n"); + l1_desc->l2ptr = dma_alloc_coherent(smmu->dev, size, + &l1_desc->l2ptr_dma, GFP_KERNEL); + if (!l1_desc->l2ptr) return -ENOMEM; - } return 0; } @@ -1229,11 +1419,12 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst, u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | CTXDESC_L1_DESC_V; - /* See comment in arm_smmu_write_ctx_desc() */ + /* The HW has 64 bit atomicity with stores to the L2 CD table */ WRITE_ONCE(*dst, cpu_to_le64(val)); } -static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) +struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, + u32 ssid) { __le64 *l1ptr; unsigned int idx; @@ -1241,8 +1432,13 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; + if (!arm_smmu_cdtab_allocated(&master->cd_table)) { + if (arm_smmu_alloc_cd_tables(master)) + return NULL; + } + if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_LINEAR) - return cd_table->cdtab + ssid * CTXDESC_CD_DWORDS; + return cd_table->cdtab.linear + ssid; idx = ssid >> CTXDESC_SPLIT; l1_desc = &cd_table->l1_desc[idx]; @@ -1250,114 +1446,138 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc)) return NULL; - l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS; + l1ptr = cd_table->cdtab.l1_desc + idx; arm_smmu_write_cd_l1_desc(l1ptr, l1_desc); /* An invalid L1CD can be cached */ arm_smmu_sync_cd(master, ssid, false); } idx = ssid & (CTXDESC_L2_ENTRIES - 1); - return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS; + return &l1_desc->l2ptr[idx]; } -int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, - struct arm_smmu_ctx_desc *cd) +struct arm_smmu_cd_writer { + struct arm_smmu_entry_writer writer; + unsigned int ssid; +}; + +static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) { - /* - * This function handles the following cases: - * - * (1) Install primary CD, for normal DMA traffic (SSID = IOMMU_NO_PASID = 0). - * (2) Install a secondary CD, for SID+SSID traffic. - * (3) Update ASID of a CD. Atomically write the first 64 bits of the - * CD, then invalidate the old entry and mappings. - * (4) Quiesce the context without clearing the valid bit. Disable - * translation, and ignore any translation fault. - * (5) Remove a secondary CD. 
- */ - u64 val; - bool cd_live; - __le64 *cdptr; - struct arm_smmu_device *smmu = master->smmu; - struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; + used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); + if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) + return; + memset(used_bits, 0xFF, sizeof(struct arm_smmu_cd)); + + /* EPD0 means T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED */ + if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) { + used_bits[0] &= ~cpu_to_le64( + CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | + CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | + CTXDESC_CD_0_TCR_SH0); + used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); + } +} - if (WARN_ON(ssid >= (1 << cd_table->s1cdmax))) - return -E2BIG; +static void arm_smmu_cd_writer_sync_entry(struct arm_smmu_entry_writer *writer) +{ + struct arm_smmu_cd_writer *cd_writer = + container_of(writer, struct arm_smmu_cd_writer, writer); - cdptr = arm_smmu_get_cd_ptr(master, ssid); - if (!cdptr) - return -ENOMEM; + arm_smmu_sync_cd(writer->master, cd_writer->ssid, true); +} - val = le64_to_cpu(cdptr[0]); - cd_live = !!(val & CTXDESC_CD_0_V); - - if (!cd) { /* (5) */ - val = 0; - } else if (cd == &quiet_cd) { /* (4) */ - if (!(smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) - val &= ~(CTXDESC_CD_0_S | CTXDESC_CD_0_R); - val |= CTXDESC_CD_0_TCR_EPD0; - } else if (cd_live) { /* (3) */ - val &= ~CTXDESC_CD_0_ASID; - val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid); - /* - * Until CD+TLB invalidation, both ASIDs may be used for tagging - * this substream's traffic - */ - } else { /* (1) and (2) */ - u64 tcr = cd->tcr; +static const struct arm_smmu_entry_writer_ops arm_smmu_cd_writer_ops = { + .sync = arm_smmu_cd_writer_sync_entry, + .get_used = arm_smmu_get_cd_used, + .v_bit = cpu_to_le64(CTXDESC_CD_0_V), + .num_entry_qwords = sizeof(struct arm_smmu_cd) / sizeof(u64), +}; + +void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, + struct arm_smmu_cd *cdptr, + const struct arm_smmu_cd *target) +{ + bool target_valid = target->data[0] & cpu_to_le64(CTXDESC_CD_0_V); + bool cur_valid = cdptr->data[0] & cpu_to_le64(CTXDESC_CD_0_V); + struct arm_smmu_cd_writer cd_writer = { + .writer = { + .ops = &arm_smmu_cd_writer_ops, + .master = master, + }, + .ssid = ssid, + }; + + if (cur_valid != target_valid) { + if (cur_valid) + master->cd_table.used_ssids--; + else + master->cd_table.used_ssids++; + } + if (ssid == IOMMU_NO_PASID) + master->cd_table.used_sid = target_valid; - cdptr[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); - cdptr[2] = 0; - cdptr[3] = cpu_to_le64(cd->mair); + arm_smmu_write_entry(&cd_writer.writer, cdptr->data, target->data); +} - if (!(smmu->features & ARM_SMMU_FEAT_HD)) - tcr &= ~CTXDESC_CD_0_TCR_HD; - if (!(smmu->features & ARM_SMMU_FEAT_HA)) - tcr &= ~CTXDESC_CD_0_TCR_HA; +void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain) +{ + const struct io_pgtable_cfg *pgtbl_cfg = + &io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops)->cfg; + typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = + &pgtbl_cfg->arm_lpae_s1_cfg.tcr; - /* - * STE may be live, and the SMMU might read dwords of this CD in any - * order. Ensure that it observes valid values before reading - * V=1. 
- */ - arm_smmu_sync_cd(master, ssid, true); + memset(target, 0, sizeof(*target)); - val = tcr | + target->data[0] = cpu_to_le64( + FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) | + FIELD_PREP(CTXDESC_CD_0_TCR_TG0, tcr->tg) | + FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, tcr->irgn) | + FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, tcr->orgn) | + FIELD_PREP(CTXDESC_CD_0_TCR_SH0, tcr->sh) | + CTXDESC_CD_0_TCR_EPD1 | #ifdef __BIG_ENDIAN - CTXDESC_CD_0_ENDI | + CTXDESC_CD_0_ENDI | #endif - CTXDESC_CD_0_R | CTXDESC_CD_0_A | - (cd->mm ? 0 : CTXDESC_CD_0_ASET) | - CTXDESC_CD_0_AA64 | - FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) | - CTXDESC_CD_0_V; + CTXDESC_CD_0_V | + FIELD_PREP(CTXDESC_CD_0_TCR_IPS, tcr->ips) | + CTXDESC_CD_0_AA64 | + (master->stall_enabled ? CTXDESC_CD_0_S : 0) | + CTXDESC_CD_0_R | + CTXDESC_CD_0_A | + CTXDESC_CD_0_ASET | + FIELD_PREP(CTXDESC_CD_0_ASID, smmu_domain->asid) + ); - if (cd_table->stall_enabled) - val |= CTXDESC_CD_0_S; - } + if (master->smmu->features & ARM_SMMU_FEAT_HD) + target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_HD); + if (master->smmu->features & ARM_SMMU_FEAT_HA) + target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_HA); - /* - * The SMMU accesses 64-bit values atomically. See IHI0070Ca 3.21.3 - * "Configuration structures and configuration invalidation completion" - * - * The size of single-copy atomic reads made by the SMMU is - * IMPLEMENTATION DEFINED but must be at least 64 bits. Any single - * field within an aligned 64-bit span of a structure can be altered - * without first making the structure invalid. - */ - WRITE_ONCE(cdptr[0], cpu_to_le64(val)); - arm_smmu_sync_cd(master, ssid, true); - return 0; + target->data[1] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.ttbr & + CTXDESC_CD_1_TTB0_MASK); + target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.mair); +} + +void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid) +{ + struct arm_smmu_cd target = {}; + struct arm_smmu_cd *cdptr; + + if (!arm_smmu_cdtab_allocated(&master->cd_table)) + return; + cdptr = arm_smmu_get_cd_ptr(master, ssid); + if (WARN_ON(!cdptr)) + return; + arm_smmu_write_cd_entry(master, ssid, cdptr, &target); } static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) { - int ret; - size_t l1size; size_t max_contexts; struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - cd_table->stall_enabled = master->stall_enabled; cd_table->s1cdmax = master->ssid_bits; max_contexts = 1 << cd_table->s1cdmax; @@ -1366,37 +1586,34 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->s1fmt = STRTAB_STE_0_S1FMT_LINEAR; cd_table->num_l1_ents = max_contexts; - l1size = max_contexts * (CTXDESC_CD_DWORDS << 3); + cd_table->cdtab.linear = dma_alloc_coherent( + smmu->dev, max_contexts * sizeof(struct arm_smmu_cd), + &cd_table->cdtab_dma, GFP_KERNEL); + if (!cd_table->cdtab.linear) + return -ENOMEM; } else { + size_t l1size; + cd_table->s1fmt = STRTAB_STE_0_S1FMT_64K_L2; cd_table->num_l1_ents = DIV_ROUND_UP(max_contexts, CTXDESC_L2_ENTRIES); - cd_table->l1_desc = devm_kcalloc(smmu->dev, cd_table->num_l1_ents, - sizeof(*cd_table->l1_desc), - GFP_KERNEL); + cd_table->l1_desc = kcalloc(cd_table->num_l1_ents, + sizeof(*cd_table->l1_desc), + GFP_KERNEL); if (!cd_table->l1_desc) return -ENOMEM; l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); + cd_table->cdtab.l1_desc = dma_alloc_coherent( + smmu->dev, l1size, &cd_table->cdtab_dma, GFP_KERNEL); + if (!cd_table->cdtab.l1_desc) { + kfree(cd_table->l1_desc); + 
cd_table->l1_desc = NULL; + return -ENOMEM; + } } - - cd_table->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cd_table->cdtab_dma, - GFP_KERNEL); - if (!cd_table->cdtab) { - dev_warn(smmu->dev, "failed to allocate context descriptor\n"); - ret = -ENOMEM; - goto err_free_l1; - } - return 0; - -err_free_l1: - if (cd_table->l1_desc) { - devm_kfree(smmu->dev, cd_table->l1_desc); - cd_table->l1_desc = NULL; - } - return ret; } static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) @@ -1407,43 +1624,28 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; if (cd_table->l1_desc) { - size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); + size = CTXDESC_L2_ENTRIES * sizeof(struct arm_smmu_cd); for (i = 0; i < cd_table->num_l1_ents; i++) { if (!cd_table->l1_desc[i].l2ptr) continue; - dmam_free_coherent(smmu->dev, size, - cd_table->l1_desc[i].l2ptr, - cd_table->l1_desc[i].l2ptr_dma); + dma_free_coherent(smmu->dev, size, + cd_table->l1_desc[i].l2ptr, + cd_table->l1_desc[i].l2ptr_dma); } - devm_kfree(smmu->dev, cd_table->l1_desc); + kfree(cd_table->l1_desc); cd_table->l1_desc = NULL; l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); + dma_free_coherent(smmu->dev, l1size, cd_table->cdtab.l1_desc, + cd_table->cdtab_dma); } else { - l1size = cd_table->num_l1_ents * (CTXDESC_CD_DWORDS << 3); + dma_free_coherent(smmu->dev, + cd_table->num_l1_ents * + sizeof(struct arm_smmu_cd), + cd_table->cdtab.linear, cd_table->cdtab_dma); } - - dmam_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma); - cd_table->cdtab_dma = 0; - cd_table->cdtab = NULL; -} - -bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd) -{ - bool free; - struct arm_smmu_ctx_desc *old_cd; - - if (!cd->asid) - return false; - - free = refcount_dec_and_test(&cd->refs); - if (free) { - old_cd = xa_erase(&arm_smmu_asid_xa, cd->asid); - WARN_ON(old_cd != cd); - } - return free; } /* Stream table manipulation functions */ @@ -1455,175 +1657,217 @@ arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc) val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span); val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK; - /* See comment in arm_smmu_write_ctx_desc() */ + /* The HW has 64 bit atomicity with stores to the L2 STE table */ WRITE_ONCE(*dst, cpu_to_le64(val)); } -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) +struct arm_smmu_ste_writer { + struct arm_smmu_entry_writer writer; + u32 sid; +}; + +static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer) { + struct arm_smmu_ste_writer *ste_writer = + container_of(writer, struct arm_smmu_ste_writer, writer); struct arm_smmu_cmdq_ent cmd = { .opcode = CMDQ_OP_CFGI_STE, .cfgi = { - .sid = sid, + .sid = ste_writer->sid, .leaf = true, }, }; - arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); + arm_smmu_cmdq_issue_cmd_with_sync(writer->master->smmu, &cmd); } -static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, - struct arm_smmu_ste *dst) +static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = { + .sync = arm_smmu_ste_writer_sync_entry, + .get_used = arm_smmu_get_ste_used, + .v_bit = cpu_to_le64(STRTAB_STE_0_V), + .num_entry_qwords = sizeof(struct arm_smmu_ste) / sizeof(u64), +}; + +static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, + struct arm_smmu_ste *ste, + const struct arm_smmu_ste *target) { - /* - * This is hideously complicated, but we only really 
care about - * three cases at the moment: - * - * 1. Invalid (all zero) -> bypass/fault (init) - * 2. Bypass/fault -> translation/bypass (attach) - * 3. Translation/bypass -> bypass/fault (detach) - * - * Given that we can't update the STE atomically and the SMMU - * doesn't read the thing in a defined order, that leaves us - * with the following maintenance requirements: - * - * 1. Update Config, return (init time STEs aren't live) - * 2. Write everything apart from dword 0, sync, write dword 0, sync - * 3. Update Config, sync - */ - u64 val = le64_to_cpu(dst->data[0]); - bool ste_live = false; struct arm_smmu_device *smmu = master->smmu; - struct arm_smmu_ctx_desc_cfg *cd_table = NULL; - struct arm_smmu_s2_cfg *s2_cfg = NULL; - struct arm_smmu_domain *smmu_domain = master->domain; - struct arm_smmu_cmdq_ent prefetch_cmd = { - .opcode = CMDQ_OP_PREFETCH_CFG, - .prefetch = { - .sid = sid, + struct arm_smmu_ste_writer ste_writer = { + .writer = { + .ops = &arm_smmu_ste_writer_ops, + .master = master, }, + .sid = sid, }; - if (smmu_domain) { - switch (smmu_domain->stage) { - case ARM_SMMU_DOMAIN_S1: - cd_table = &master->cd_table; - break; - case ARM_SMMU_DOMAIN_S2: - s2_cfg = &smmu_domain->s2_cfg; - break; - default: - break; - } - } + arm_smmu_write_entry(&ste_writer.writer, ste->data, target->data); - if (val & STRTAB_STE_0_V) { - switch (FIELD_GET(STRTAB_STE_0_CFG, val)) { - case STRTAB_STE_0_CFG_BYPASS: - break; - case STRTAB_STE_0_CFG_S1_TRANS: - case STRTAB_STE_0_CFG_S2_TRANS: - ste_live = true; - break; - case STRTAB_STE_0_CFG_ABORT: - BUG_ON(!disable_bypass); - break; - default: - BUG(); /* STE corruption */ - } - } + /* It's likely that we'll want to use the new STE soon */ + if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) { + struct arm_smmu_cmdq_ent + prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG, + .prefetch = { + .sid = sid, + } }; - /* Nuke the existing STE_0 value, as we're going to rewrite it */ - val = STRTAB_STE_0_V; + arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); + } +} - /* Bypass/fault */ - if (!smmu_domain || !(cd_table || s2_cfg)) { - if (!smmu_domain && disable_bypass) - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); - else - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); +void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) +{ + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT)); +} - dst->data[0] = cpu_to_le64(val); - dst->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, - STRTAB_STE_1_SHCFG_INCOMING)); - dst->data[2] = 0; /* Nuke the VMID */ - /* - * The SMMU can perform negative caching, so we must sync - * the STE regardless of whether the old value was live. - */ - if (smmu) - arm_smmu_sync_ste_for_sid(smmu, sid); - return; - } +void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target) +{ + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS)); + target->data[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); +} - if (cd_table) { - u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? 
- STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; +void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, bool ats_enabled, + unsigned int s1dss) +{ + struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; + struct arm_smmu_device *smmu = master->smmu; - BUG_ON(ste_live); - dst->data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | - FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | - FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | - FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | - FIELD_PREP(STRTAB_STE_1_STRW, strw)); + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | + FIELD_PREP(STRTAB_STE_0_S1FMT, cd_table->s1fmt) | + (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | + FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax)); + + target->data[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_S1DSS, s1dss) | + FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | + FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | + FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | + ((smmu->features & ARM_SMMU_FEAT_STALLS && + !master->stall_enabled) ? + STRTAB_STE_1_S1STALLD : + 0) | + FIELD_PREP(STRTAB_STE_1_EATS, + ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + + if (s1dss == STRTAB_STE_1_S1DSS_BYPASS) + target->data[1] |= cpu_to_le64(FIELD_PREP( + STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - if (smmu->features & ARM_SMMU_FEAT_STALLS && - !master->stall_enabled) - dst->data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); + if (smmu->features & ARM_SMMU_FEAT_E2H) { + /* + * To support BTM the streamworld needs to match the + * configuration of the CPU so that the ASID broadcasts are + * properly matched. This means either S/NS-EL2-E2H (hypervisor) + * or NS-EL1 (guest). Since an SVA domain can be installed in a + * PASID this should always use a BTM compatible configuration + * if the HW supports it. + */ + target->data[1] |= cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_STRW, STRTAB_STE_1_STRW_EL2)); + } else { + target->data[1] |= cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_STRW, STRTAB_STE_1_STRW_NSEL1)); - val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | - FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | - FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax) | - FIELD_PREP(STRTAB_STE_0_S1FMT, cd_table->s1fmt); + /* + * VMID 0 is reserved for stage-2 bypass EL1 STEs, see + * arm_smmu_domain_alloc_id() + */ + target->data[2] = + cpu_to_le64(FIELD_PREP(STRTAB_STE_2_S2VMID, 0)); } +} - if (s2_cfg) { - BUG_ON(ste_live); - dst->data[2] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | - FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | +void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, + bool ats_enabled) +{ + const struct io_pgtable_cfg *pgtbl_cfg = + &io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops)->cfg; + typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr = + &pgtbl_cfg->arm_lpae_s2_cfg.vtcr; + u64 vtcr_val; + + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS)); + + target->data[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_EATS, + ats_enabled ? 
STRTAB_STE_1_EATS_TRANS : 0) | + FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); + + vtcr_val = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, vtcr->irgn) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, vtcr->orgn) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, vtcr->sh) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, vtcr->tg) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, vtcr->ps); + target->data[2] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_2_S2VMID, smmu_domain->vmid) | + FIELD_PREP(STRTAB_STE_2_VTCR, vtcr_val) | + STRTAB_STE_2_S2AA64 | #ifdef __BIG_ENDIAN - STRTAB_STE_2_S2ENDI | + STRTAB_STE_2_S2ENDI | #endif - STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | - STRTAB_STE_2_S2R); + STRTAB_STE_2_S2PTW | + STRTAB_STE_2_S2R); - dst->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); + target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s2_cfg.vttbr & + STRTAB_STE_3_S2TTB_MASK); +} - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); +static void arm_smmu_make_nested_domain_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + /* + * Userspace can request a non-valid STE through the nesting interface. + * We relay that into a non-valid physical STE with the intention that + * C_BAD_STE for this SID can be delivered to userspace. + */ + if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) { + memset(target, 0, sizeof(*target)); + return; } - if (master->ats_enabled) - dst->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, - STRTAB_STE_1_EATS_TRANS)); - - arm_smmu_sync_ste_for_sid(smmu, sid); - /* See comment in arm_smmu_write_ctx_desc() */ - WRITE_ONCE(dst->data[0], cpu_to_le64(val)); - arm_smmu_sync_ste_for_sid(smmu, sid); + arm_smmu_make_s2_domain_ste(target, master, nested_domain->s2_parent, + ats_enabled); - /* It's likely that we'll want to use the new STE soon */ - if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) - arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); + target->data[0] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_0_CFG, + STRTAB_STE_0_CFG_NESTED)) | + nested_domain->ste[0]; + target->data[1] |= nested_domain->ste[1]; } -static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab, - unsigned int nent, bool force) +/* + * This can safely directly manipulate the STE memory without a sync sequence + * because the STE table has not been installed in the SMMU yet. 
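+ *
+ * Illustrative sketch (reviewer addition, not part of this patch): at
+ * probe time a freshly allocated table can simply be filled in place,
+ *
+ *	for (i = 0; i != nent; i++)
+ *		arm_smmu_make_bypass_ste(&strtab[i]);
+ *
+ * whereas once the table is visible to the HW every update has to go
+ * through arm_smmu_write_ste() so no reader can observe a torn entry.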
+ */ +static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, + unsigned int nent) { unsigned int i; - u64 val = STRTAB_STE_0_V; - - if (disable_bypass && !force) - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); - else - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); for (i = 0; i < nent; ++i) { - strtab->data[0] = cpu_to_le64(val); - strtab->data[1] = cpu_to_le64(FIELD_PREP( - STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - strtab->data[2] = 0; + if (disable_bypass) + arm_smmu_make_abort_ste(strtab); + else + arm_smmu_make_bypass_ste(strtab); strtab++; } } @@ -1638,8 +1882,8 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) if (desc->l2ptr) return 0; - size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3); - strtab = &cfg->strtab[(sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS]; + size = (1 << STRTAB_SPLIT) * sizeof(struct arm_smmu_ste); + strtab = &cfg->strtab.l1_desc[sid >> STRTAB_SPLIT]; desc->span = STRTAB_SPLIT + 1; desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &desc->l2ptr_dma, @@ -1651,31 +1895,42 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return -ENOMEM; } - arm_smmu_init_bypass_stes(desc->l2ptr, 1 << STRTAB_SPLIT, false); + arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); arm_smmu_write_strtab_l1_desc(strtab, desc); return 0; } +static int arm_smmu_streams_cmp_key(const void *lhs, const struct rb_node *rhs) +{ + struct arm_smmu_stream *stream_rhs = + rb_entry(rhs, struct arm_smmu_stream, node); + const u32 *sid_lhs = lhs; + + if (*sid_lhs < stream_rhs->id) + return -1; + if (*sid_lhs > stream_rhs->id) + return 1; + return 0; +} + +static int arm_smmu_streams_cmp_node(struct rb_node *lhs, + const struct rb_node *rhs) +{ + return arm_smmu_streams_cmp_key( + &rb_entry(lhs, struct arm_smmu_stream, node)->id, rhs); +} + static struct arm_smmu_master * arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) { struct rb_node *node; - struct arm_smmu_stream *stream; lockdep_assert_held(&smmu->streams_mutex); - node = smmu->streams.rb_node; - while (node) { - stream = rb_entry(node, struct arm_smmu_stream, node); - if (stream->id < sid) - node = node->rb_right; - else if (stream->id > sid) - node = node->rb_left; - else - return stream->master; - } - - return NULL; + node = rb_find(&sid, &smmu->streams, arm_smmu_streams_cmp_key); + if (!node) + return NULL; + return rb_entry(node, struct arm_smmu_stream, node)->master; } /* IRQ and event handlers */ @@ -1977,13 +2232,14 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size, cmd->atc.size = log2_span; } -static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) +static int arm_smmu_atc_inv_master(struct arm_smmu_master *master, + ioasid_t ssid) { int i, ret; struct arm_smmu_cmdq_ent cmd; struct arm_smmu_cmdq_batch cmds; - arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); + arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd); cmds.num = 0; arm_smmu_preempt_disable(master->smmu); @@ -1997,13 +2253,13 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) return ret; } -int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, +int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, unsigned long iova, size_t size) { int i, ret; + struct arm_smmu_master_domain *master_domain; unsigned long flags; struct arm_smmu_cmdq_ent cmd; - struct arm_smmu_master *master; struct arm_smmu_cmdq_batch cmds; if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) @@ -2026,16 
+2282,28 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, if (!atomic_read(&smmu_domain->nr_ats_masters)) return 0; - arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); - cmds.num = 0; arm_smmu_preempt_disable(smmu_domain->smmu); spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + struct arm_smmu_master *master = master_domain->master; + if (!master->ats_enabled) continue; + if (master_domain->nested_parent) { + /* + * If a S2 used as a nesting parent is changed we have + * no option but to completely flush the ATC. + */ + arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); + } else { + arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, + &cmd); + } + for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd); @@ -2053,8 +2321,6 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, static void arm_smmu_tlb_inv_context(void *cookie) { struct arm_smmu_domain *smmu_domain = cookie; - struct arm_smmu_device *smmu = smmu_domain->smmu; - struct arm_smmu_cmdq_ent cmd; /* * NOTE: when io-pgtable is in non-strict mode, we may get here with @@ -2063,14 +2329,11 @@ static void arm_smmu_tlb_inv_context(void *cookie) * insertion to guarantee those are observed before the TLBI. Do be * careful, 007. */ - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { - arm_smmu_tlb_inv_asid(smmu, smmu_domain->cd.asid); - } else { - cmd.opcode = CMDQ_OP_TLBI_S12_VMALL; - cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; - arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); - } - arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, 0, 0); + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) + arm_smmu_tlb_inv_all_s1(smmu_domain); + else + arm_smmu_tlb_inv_all_s2(smmu_domain); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0); } static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, @@ -2146,47 +2409,78 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, arm_smmu_preempt_enable(smmu); } -static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, - size_t granule, bool leaf, - struct arm_smmu_domain *smmu_domain) +static bool arm_smmu_inv_range_too_big(struct arm_smmu_device *smmu, + size_t size, size_t granule) +{ + unsigned int max_ops; + + /* 0 size means invalidate all */ + if (!size || size == SIZE_MAX) + return true; + + if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) + return false; + + /* + * Cloned from the MAX_TLBI_OPS in arch/arm64/include/asm/tlbflush.h, + * this is used as a threshold to replace per-page TLBI commands to + * issue in the command queue with an address-space TLBI command, when + * SMMU w/o a range invalidation feature handles too many per-page TLBI + * commands, which will otherwise result in a soft lockup. + */ + max_ops = 1 << (ilog2(granule) - 3); + return size >= max_ops * granule; +} + +static void arm_smmu_tlb_inv_range_s2(struct arm_smmu_domain *smmu_domain, + unsigned long iova, size_t size, + size_t granule, bool leaf) { struct arm_smmu_cmdq_ent cmd = { + .opcode = CMDQ_OP_TLBI_S2_IPA, .tlbi = { + .vmid = smmu_domain->vmid, .leaf = leaf, }, }; - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { - cmd.opcode = smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ? 
-		CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA;
-		cmd.tlbi.asid	= smmu_domain->cd.asid;
+	if (smmu_domain->nesting_parent ||
+	    arm_smmu_inv_range_too_big(smmu_domain->smmu, size, granule)) {
+		cmd.opcode = CMDQ_OP_TLBI_S12_VMALL;
+		arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd);
 	} else {
-		cmd.opcode	= CMDQ_OP_TLBI_S2_IPA;
-		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
+		__arm_smmu_tlb_inv_range(&cmd, iova, size, granule,
+					 smmu_domain);
 	}
-	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
+}

-	/*
-	 * Unfortunately, this can't be leaf-only since we may have
-	 * zapped an entire table.
-	 */
-	arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, iova, size);
+static void arm_smmu_tlb_inv_all_s2(struct arm_smmu_domain *smmu_domain)
+{
+	arm_smmu_tlb_inv_range_s2(smmu_domain, 0, 0, PAGE_SIZE, false);
 }

-void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
-				 size_t granule, bool leaf,
-				 struct arm_smmu_domain *smmu_domain)
+void arm_smmu_tlb_inv_range_s1(struct arm_smmu_domain *smmu_domain,
+			       unsigned long iova, size_t size,
+			       size_t granule, bool leaf)
 {
 	struct arm_smmu_cmdq_ent cmd = {
 		.opcode	= smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ?
			  CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA,
 		.tlbi = {
-			.asid	= asid,
+			.asid	= READ_ONCE(smmu_domain->asid),
 			.leaf	= leaf,
 		},
 	};

-	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
+	if (arm_smmu_inv_range_too_big(smmu_domain->smmu, size, granule)) {
+		cmd.opcode = smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ?
+				     CMDQ_OP_TLBI_EL2_ASID :
+				     CMDQ_OP_TLBI_NH_ASID;
+		arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd);
+	} else {
+		__arm_smmu_tlb_inv_range(&cmd, iova, size, granule,
+					 smmu_domain);
+	}
 }

 static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
@@ -2202,7 +2496,15 @@ static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
 static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
				  size_t granule, void *cookie)
 {
-	arm_smmu_tlb_inv_range_domain(iova, size, granule, false, cookie);
+	struct arm_smmu_domain *smmu_domain = cookie;
+
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
+		arm_smmu_tlb_inv_range_s1(smmu_domain, iova, size, granule,
+					  false);
+	else
+		arm_smmu_tlb_inv_range_s2(smmu_domain, iova, size, granule,
+					  false);
+	arm_smmu_atc_inv_domain(smmu_domain, iova, size);
 }

 static const struct iommu_flush_ops arm_smmu_flush_ops = {
@@ -2228,138 +2530,136 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
 	}
 }

-static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
+static void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type)
 {
-	struct arm_smmu_domain *smmu_domain;
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct iommu_hw_info_arm_smmuv3 *info;
+	u32 __iomem *base_idr;
+	unsigned int i;

-	if (type == IOMMU_DOMAIN_SVA)
-		return arm_smmu_sva_domain_alloc();
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);

-	if (type != IOMMU_DOMAIN_UNMANAGED &&
-	    type != IOMMU_DOMAIN_DMA &&
-	    type != IOMMU_DOMAIN_IDENTITY)
-		return NULL;
+	base_idr = master->smmu->base + ARM_SMMU_IDR0;
+	for (i = 0; i <= 5; i++)
+		info->idr[i] = readl_relaxed(base_idr + i);
+	info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR);
+	info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR);
+
+	*length = sizeof(*info);
+	*type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3;
+
+	return info;
+}
+
+struct arm_smmu_domain *arm_smmu_domain_alloc(void)
+{
+	struct arm_smmu_domain *smmu_domain;

-	/*
-	 * Allocate the domain and initialise some of its data structures.
-	 * We can't really do anything meaningful until we've added a
-	 * master.
-	 */
 	smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL);
 	if (!smmu_domain)
-		return NULL;
+		return ERR_PTR(-ENOMEM);

 	mutex_init(&smmu_domain->init_mutex);
 	INIT_LIST_HEAD(&smmu_domain->devices);
 	spin_lock_init(&smmu_domain->devices_lock);
-	INIT_LIST_HEAD(&smmu_domain->mmu_notifiers);

-	return &smmu_domain->domain;
+	return smmu_domain;
 }

-static void arm_smmu_domain_free(struct iommu_domain *domain)
+static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
 {
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_domain *smmu_domain;

-	free_io_pgtable_ops(smmu_domain->pgtbl_ops);
+	smmu_domain = arm_smmu_domain_alloc();
+	if (IS_ERR(smmu_domain))
+		return ERR_CAST(smmu_domain);

-	/* Free the ASID or VMID */
-	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
-		/* Prevent SVA from touching the CD while we're freeing it */
-		mutex_lock(&arm_smmu_asid_lock);
-		arm_smmu_free_asid(&smmu_domain->cd);
-		mutex_unlock(&arm_smmu_asid_lock);
-	} else {
-		struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
-		if (cfg->vmid)
-			ida_free(&smmu->vmid_map, cfg->vmid);
+	/*
+	 * We can't really do anything meaningful with the domain until a
+	 * master is added, so only finalise it here if the core handed us a
+	 * device to take the SMMU instance from.
+	 */
+	if (dev) {
+		struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+		int ret;
+
+		ret = arm_smmu_domain_finalise(smmu_domain, master->smmu);
+		if (ret) {
+			kfree(smmu_domain);
+			return ERR_PTR(ret);
+		}
 	}
+	return &smmu_domain->domain;
+}

-	kfree(smmu_domain);
+int arm_smmu_domain_alloc_id(struct arm_smmu_device *smmu,
+			     struct arm_smmu_domain *smmu_domain)
+{
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 ||
+	    smmu_domain->domain.type == IOMMU_DOMAIN_SVA) {
+		return xa_alloc(&smmu->asid_map, &smmu_domain->asid,
+				smmu_domain,
+				XA_LIMIT(1, (1 << smmu->asid_bits) - 1),
+				GFP_KERNEL);
+	} else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2) {
+		int vmid;
+
+		/* Reserve VMID 0 for stage-2 bypass STEs */
+		vmid = ida_alloc_range(&smmu->vmid_map, 1,
+				       (1 << smmu->vmid_bits) - 1, GFP_KERNEL);
+		if (vmid < 0)
+			return vmid;
+		smmu_domain->vmid = vmid;
+		return 0;
+	}
+	WARN_ON(true);
+	return -EINVAL;
 }

-static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
-				       struct io_pgtable_cfg *pgtbl_cfg)
+/*
+ * Return the domain's ASID or VMID back to the allocator. All IDs in the
+ * allocator have no IOTLB entries referencing them.
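+ *
+ * Concretely (clarifying note, matching the body below): an S1 ASID is
+ * erased from asid_map only after arm_smmu_tlb_inv_all_s1(), and an S2
+ * VMID is returned to vmid_map only after arm_smmu_tlb_inv_all_s2(), so
+ * the next owner of a recycled ID starts with a clean IOTLB.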
+ */ +void arm_smmu_domain_free_id(struct arm_smmu_domain *smmu_domain) { - int ret; - u32 asid; struct arm_smmu_device *smmu = smmu_domain->smmu; - struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; - typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = &pgtbl_cfg->arm_lpae_s1_cfg.tcr; - - refcount_set(&cd->refs, 1); - - /* Prevent SVA from modifying the ASID until it is written to the CD */ - mutex_lock(&arm_smmu_asid_lock); - ret = xa_alloc(&arm_smmu_asid_xa, &asid, cd, - XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); - if (ret) - goto out_unlock; - cd->asid = (u16)asid; - cd->ttbr = pgtbl_cfg->arm_lpae_s1_cfg.ttbr; - cd->tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) | - FIELD_PREP(CTXDESC_CD_0_TCR_TG0, tcr->tg) | - FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, tcr->irgn) | - FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, tcr->orgn) | - FIELD_PREP(CTXDESC_CD_0_TCR_SH0, tcr->sh) | - FIELD_PREP(CTXDESC_CD_0_TCR_IPS, tcr->ips) | - CTXDESC_CD_0_TCR_HA | CTXDESC_CD_0_TCR_HD | - CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64; - cd->mair = pgtbl_cfg->arm_lpae_s1_cfg.mair; - - mutex_unlock(&arm_smmu_asid_lock); - return 0; + if ((smmu_domain->stage == ARM_SMMU_DOMAIN_S1 || + smmu_domain->domain.type == IOMMU_DOMAIN_SVA) && + smmu_domain->asid) { + arm_smmu_tlb_inv_all_s1(smmu_domain); -out_unlock: - mutex_unlock(&arm_smmu_asid_lock); - return ret; + /* Prevent SVA from touching the CD while we're freeing it */ + mutex_lock(&smmu->asid_lock); + xa_erase(&smmu->asid_map, smmu_domain->asid); + mutex_unlock(&smmu->asid_lock); + } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2 && + smmu_domain->vmid) { + arm_smmu_tlb_inv_all_s2(smmu_domain); + ida_free(&smmu->vmid_map, smmu_domain->vmid); + } } -static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, - struct io_pgtable_cfg *pgtbl_cfg) +static void arm_smmu_domain_free(struct iommu_domain *domain) { - int vmid; - struct arm_smmu_device *smmu = smmu_domain->smmu; - struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; - typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr; - - /* Reserve VMID 0 for stage-2 bypass STEs */ - vmid = ida_alloc_range(&smmu->vmid_map, 1, (1 << smmu->vmid_bits) - 1, - GFP_KERNEL); - if (vmid < 0) - return vmid; - - vtcr = &pgtbl_cfg->arm_lpae_s2_cfg.vtcr; - cfg->vmid = (u16)vmid; - cfg->vttbr = pgtbl_cfg->arm_lpae_s2_cfg.vttbr; - cfg->vtcr = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, vtcr->irgn) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, vtcr->orgn) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, vtcr->sh) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, vtcr->tg) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, vtcr->ps); - return 0; + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + + free_io_pgtable_ops(smmu_domain->pgtbl_ops); + arm_smmu_domain_free_id(smmu_domain); + kfree(smmu_domain); } -static int arm_smmu_domain_finalise(struct iommu_domain *domain) +static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_device *smmu) { int ret; unsigned long ias, oas; enum io_pgtable_fmt fmt; struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; - int (*finalise_stage_fn)(struct arm_smmu_domain *, - struct io_pgtable_cfg *); - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_device *smmu = smmu_domain->smmu; - - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS; - return 0; - } /* Restrict the stage to what we can actually support */ if 
(!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) @@ -2373,13 +2673,11 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) ias = min_t(unsigned long, ias, VA_BITS); oas = smmu->ias; fmt = ARM_64_LPAE_S1; - finalise_stage_fn = arm_smmu_domain_finalise_s1; break; case ARM_SMMU_DOMAIN_S2: ias = smmu->ias; oas = smmu->oas; fmt = ARM_64_LPAE_S2; - finalise_stage_fn = arm_smmu_domain_finalise_s2; break; default: return -EINVAL; @@ -2406,17 +2704,18 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) if (!pgtbl_ops) return -ENOMEM; - domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; - domain->geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; - domain->geometry.force_aperture = true; + smmu_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; + smmu_domain->domain.geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; + smmu_domain->domain.geometry.force_aperture = true; - ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg); + ret = arm_smmu_domain_alloc_id(smmu, smmu_domain); if (ret < 0) { free_io_pgtable_ops(pgtbl_ops); return ret; } smmu_domain->pgtbl_ops = pgtbl_ops; + smmu_domain->smmu = smmu; return 0; } @@ -2434,16 +2733,23 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) return &cfg->l1_desc[idx1].l2ptr[idx2]; } else { /* Simple linear lookup */ - return (struct arm_smmu_ste *)&cfg - ->strtab[sid * STRTAB_STE_DWORDS]; + return &cfg->strtab.linear[sid]; } } -static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) +static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target) { int i, j; struct arm_smmu_device *smmu = master->smmu; + master->cd_table.in_ste = + FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(target->data[0])) == + STRTAB_STE_0_CFG_S1_TRANS; + master->ste_ats_enabled = + FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(target->data[1])) == + STRTAB_STE_1_EATS_TRANS; + for (i = 0; i < master->num_streams; ++i) { u32 sid = master->streams[i].id; struct arm_smmu_ste *step = @@ -2456,227 +2762,889 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) if (j < i) continue; - arm_smmu_write_strtab_ent(master, sid, step); + arm_smmu_write_ste(master, sid, step, target); + } +} + +static bool arm_smmu_ats_supported(struct arm_smmu_master *master) +{ + struct device *dev = master->dev; + struct arm_smmu_device *smmu = master->smmu; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + if (!(smmu->features & ARM_SMMU_FEAT_ATS)) + return false; + + if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS)) + return false; + + return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)); +} + +static void arm_smmu_enable_ats(struct arm_smmu_master *master) +{ + size_t stu; + struct pci_dev *pdev; + struct arm_smmu_device *smmu = master->smmu; + + /* Smallest Translation Unit: log2 of the smallest supported granule */ + stu = __ffs(smmu->pgsize_bitmap); + pdev = to_pci_dev(master->dev); + + /* + * ATC invalidation of PASID 0 causes the entire ATC to be flushed. 
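+	 *
+	 * (Illustrative note) the command built by
+	 * arm_smmu_atc_inv_to_cmd() for this case is an ATC_INV with
+	 * SSV=0 and a range spanning the whole address space, which the
+	 * endpoint must treat as "drop every cached translation". That
+	 * gives a clean ATC before pci_enable_ats() turns ATS on.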
+ */ + arm_smmu_atc_inv_master(master, IOMMU_NO_PASID); + if (pci_enable_ats(pdev, stu)) + dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); +} + +static int arm_smmu_enable_pasid(struct arm_smmu_master *master) +{ + int ret; + int features; + int num_pasids; + struct pci_dev *pdev; + + if (!dev_is_pci(master->dev)) + return -ENODEV; + + pdev = to_pci_dev(master->dev); + + features = pci_pasid_features(pdev); + if (features < 0) + return features; + + num_pasids = pci_max_pasids(pdev); + if (num_pasids <= 0) + return num_pasids; + + ret = pci_enable_pasid(pdev, features); + if (ret) { + dev_err(&pdev->dev, "Failed to enable PASID\n"); + return ret; + } + + master->ssid_bits = min_t(u8, ilog2(num_pasids), + master->smmu->ssid_bits); + return 0; +} + +static void arm_smmu_disable_pasid(struct arm_smmu_master *master) +{ + struct pci_dev *pdev; + + if (!dev_is_pci(master->dev)) + return; + + pdev = to_pci_dev(master->dev); + + if (!pdev->pasid_enabled) + return; + + master->ssid_bits = 0; + pci_disable_pasid(pdev); +} + +static struct arm_smmu_master_domain * +arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_master *master, + ioasid_t ssid) +{ + struct arm_smmu_master_domain *master_domain; + + lockdep_assert_held(&smmu_domain->devices_lock); + + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + if (master_domain->master == master && + master_domain->ssid == ssid) + return master_domain; + } + return NULL; +} + +/* + * If the domain uses the smmu_domain->devices list return the arm_smmu_domain + * structure, otherwise NULL. These domains track attached devices so they can + * issue invalidations. + */ +static struct arm_smmu_domain * +to_smmu_domain_devices(struct iommu_domain *domain) +{ + /* The domain can be NULL only when processing the first attach */ + if (!domain) + return NULL; + if ((domain->type & __IOMMU_DOMAIN_PAGING) || + domain->type == IOMMU_DOMAIN_SVA) + return to_smmu_domain(domain); + if (domain->type == IOMMU_DOMAIN_NESTED) + return container_of(domain, struct arm_smmu_nested_domain, + domain)->s2_parent; + return NULL; +} + +static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, + struct iommu_domain *domain, + ioasid_t ssid) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain); + struct arm_smmu_master_domain *master_domain; + unsigned long flags; + + if (!smmu_domain) + return; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid); + if (master_domain) { + list_del(&master_domain->devices_elm); + kfree(master_domain); + if (master->ats_enabled) + atomic_dec(&smmu_domain->nr_ats_masters); + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); +} + +struct attach_state { + bool want_ats; + bool disable_ats; + ioasid_t ssid; +}; + +/* + * Prepare to attach a domain to a master. If disable_ats is not set this will + * turn on ATS if supported. smmu_domain can be NULL if the domain being + * attached does not have a page table and does not require invalidation + * tracking. 
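+ *
+ * Expected caller pattern (sketch only; arm_smmu_attach_dev() below is
+ * the canonical user):
+ *
+ *	mutex_lock(&master->smmu->asid_lock);
+ *	ret = arm_smmu_attach_prepare(master, new_domain, &state);
+ *	... program the STE/CD using state.want_ats ...
+ *	arm_smmu_attach_commit(master, &state);
+ *	mutex_unlock(&master->smmu->asid_lock);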
+ */ +static int arm_smmu_attach_prepare(struct arm_smmu_master *master, + struct iommu_domain *domain, + struct attach_state *state) +{ + struct arm_smmu_domain *smmu_domain = + to_smmu_domain_devices(domain); + struct arm_smmu_master_domain *master_domain; + unsigned long flags; + + /* + * arm_smmu_share_asid() must not see two domains pointing to the same + * arm_smmu_master_domain contents otherwise it could randomly write one + * or the other to the CD. + */ + lockdep_assert_held(&master->smmu->asid_lock); + + state->want_ats = !state->disable_ats && arm_smmu_ats_supported(master); + + if (smmu_domain) { + master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL); + if (!master_domain) + return -ENOMEM; + master_domain->master = master; + master_domain->ssid = state->ssid; + master_domain->nested_parent = domain->type == + IOMMU_DOMAIN_NESTED; + + /* + * During prepare we want the current smmu_domain and new + * smmu_domain to be in the devices list before we change any + * HW. This ensures that both domains will send ATS + * invalidations to the master until we are done. + * + * It is tempting to make this list only track masters that are + * using ATS, but arm_smmu_share_asid() also uses this to change + * the ASID of a domain, unrelated to ATS. + * + * Notice if we are re-attaching the same domain then the list + * will have two identical entries and commit will remove only + * one of them. + */ + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + if (state->want_ats) + atomic_inc(&smmu_domain->nr_ats_masters); + list_add(&master_domain->devices_elm, &smmu_domain->devices); + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + } + + if (!state->want_ats && master->ats_enabled) { + pci_disable_ats(to_pci_dev(master->dev)); + /* + * This is probably overkill, but the config write for disabling + * ATS should complete before the STE is configured to generate + * UR to avoid AER noise. + */ + wmb(); + } + return 0; +} + +/* + * Commit is done after the STE/CD are configured with the EATS setting. It + * completes synchronizing the PCI device's ATC and finishes manipulating the + * smmu_domain->devices list. + */ +static void arm_smmu_attach_commit(struct arm_smmu_master *master, + struct attach_state *state) +{ + lockdep_assert_held(&master->smmu->asid_lock); + + if (state->want_ats && !master->ats_enabled) { + arm_smmu_enable_ats(master); + } else if (master->ats_enabled) { + /* + * The translation has changed, flush the ATC. At this point the + * SMMU is translating for the new domain and both the old&new + * domain will issue invalidations. 
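+		 *
+		 * For example, when SSID 0 moves from domain A to
+		 * domain B, stale A translations may still sit in the
+		 * ATC; flushing the SSID here means only B's mappings
+		 * can be (re)fetched from now on.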
+ */ + if (state->want_ats) + arm_smmu_atc_inv_master(master, state->ssid); + else + arm_smmu_atc_inv_master(master, IOMMU_NO_PASID); + } + master->ats_enabled = state->want_ats; + + arm_smmu_remove_master_domain( + master, iommu_get_domain_for_dev(master->dev), state->ssid); +} + +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +{ + int ret = 0; + struct arm_smmu_ste target; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct arm_smmu_device *smmu; + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct attach_state state = {.ssid = IOMMU_NO_PASID}; + struct arm_smmu_master *master; + struct arm_smmu_cd *cdptr; + + if (!fwspec) + return -ENOENT; + + master = dev_iommu_priv_get(dev); + smmu = master->smmu; + + mutex_lock(&smmu_domain->init_mutex); + + if (!smmu_domain->smmu) { + ret = arm_smmu_domain_finalise(smmu_domain, smmu); + } else if (smmu_domain->smmu != smmu) + ret = -EINVAL; + + mutex_unlock(&smmu_domain->init_mutex); + if (ret) + return ret; + + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { + cdptr = arm_smmu_get_cd_ptr(master, IOMMU_NO_PASID); + if (!cdptr) + return -ENOMEM; + } else if (arm_smmu_ssids_in_use(&master->cd_table)) + return -EBUSY; + + /* + * Prevent arm_smmu_share_asid() from trying to change the ASID + * of either the old or new domain while we are working on it. + * This allows the STE and the smmu_domain->devices list to + * be inconsistent during this routine. + */ + mutex_lock(&smmu->asid_lock); + + ret = arm_smmu_attach_prepare(master, domain, &state); + if (ret) { + mutex_unlock(&smmu->asid_lock); + return ret; + } + + switch (smmu_domain->stage) { + case ARM_SMMU_DOMAIN_S1: { + struct arm_smmu_cd target_cd; + + arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); + arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr, + &target_cd); + arm_smmu_make_cdtable_ste(&target, master, state.want_ats, + STRTAB_STE_1_S1DSS_SSID0); + arm_smmu_install_ste_for_dev(master, &target); + break; + } + case ARM_SMMU_DOMAIN_S2: + arm_smmu_make_s2_domain_ste(&target, master, smmu_domain, + state.want_ats); + arm_smmu_install_ste_for_dev(master, &target); + arm_smmu_clear_cd(master, IOMMU_NO_PASID); + break; + } + + arm_smmu_attach_commit(master, &state); + mutex_unlock(&smmu->asid_lock); + return 0; +} + +static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_cd target_cd; + int ret = 0; + + mutex_lock(&smmu_domain->init_mutex); + if (!smmu_domain->smmu) + ret = arm_smmu_domain_finalise(smmu_domain, smmu); + else if (smmu_domain->smmu != smmu) + ret = -EINVAL; + mutex_unlock(&smmu_domain->init_mutex); + if (ret) + return ret; + + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + return -EINVAL; + + /* + * We can read cd.asid outside the lock because arm_smmu_set_pasid() + * will fix it + */ + arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); + return arm_smmu_set_pasid(master, to_smmu_domain(domain), id, + &target_cd); +} + +static void arm_smmu_update_ste(struct arm_smmu_master *master, + struct iommu_domain *sid_domain, + bool want_ats) +{ + unsigned int s1dss = STRTAB_STE_1_S1DSS_TERMINATE; + struct arm_smmu_ste ste; + + if (master->cd_table.in_ste && master->ste_ats_enabled == want_ats) + return; + + if (sid_domain->type == IOMMU_DOMAIN_IDENTITY) + s1dss = 
STRTAB_STE_1_S1DSS_BYPASS;
+	else
+		WARN_ON(sid_domain->type != IOMMU_DOMAIN_BLOCKED);
+
+	/*
+	 * Change the STE into a cdtable one with SID IDENTITY/BLOCKED behavior
+	 * using s1dss if necessary. If the cd_table is already installed then
+	 * the S1DSS is correct and this will just update the EATS. Otherwise
+	 * it installs the entire thing. This will be hitless.
+	 */
+	arm_smmu_make_cdtable_ste(&ste, master, want_ats, s1dss);
+	arm_smmu_install_ste_for_dev(master, &ste);
+}
+
+int arm_smmu_set_pasid(struct arm_smmu_master *master,
+		       struct arm_smmu_domain *smmu_domain, ioasid_t pasid,
+		       struct arm_smmu_cd *cd)
+{
+	struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev);
+	struct attach_state state = {.ssid = pasid};
+	struct arm_smmu_cd *cdptr;
+	int ret;
+
+	if (smmu_domain->smmu != master->smmu || pasid == IOMMU_NO_PASID)
+		return -EINVAL;
+
+	if (!master->cd_table.in_ste &&
+	    sid_domain->type != IOMMU_DOMAIN_IDENTITY &&
+	    sid_domain->type != IOMMU_DOMAIN_BLOCKED)
+		return -EINVAL;
+
+	cdptr = arm_smmu_get_cd_ptr(master, pasid);
+	if (!cdptr)
+		return -ENOMEM;
+
+	mutex_lock(&master->smmu->asid_lock);
+	ret = arm_smmu_attach_prepare(master, &smmu_domain->domain, &state);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * We don't want to take the asid_lock too early, so fix up the
+	 * caller-set ASID under the lock in case it changed.
+	 */
+	cd->data[0] &= ~cpu_to_le64(CTXDESC_CD_0_ASID);
+	cd->data[0] |=
+		cpu_to_le64(FIELD_PREP(CTXDESC_CD_0_ASID, smmu_domain->asid));
+
+	arm_smmu_write_cd_entry(master, pasid, cdptr, cd);
+	arm_smmu_update_ste(master, sid_domain, state.want_ats);
+
+	arm_smmu_attach_commit(master, &state);
+
+out_unlock:
+	mutex_unlock(&master->smmu->asid_lock);
+	return ret;
+}
+
+static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
+{
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct arm_smmu_domain *smmu_domain;
+	struct iommu_domain *domain;
+	bool last_ssid = master->cd_table.used_ssids == 1;
+
+	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
+	if (WARN_ON(IS_ERR(domain)) || !domain)
+		return;
+
+	smmu_domain = to_smmu_domain(domain);
+
+	mutex_lock(&master->smmu->asid_lock);
+	arm_smmu_clear_cd(master, pasid);
+	if (master->ats_enabled)
+		arm_smmu_atc_inv_master(master, pasid);
+	arm_smmu_remove_master_domain(master, &smmu_domain->domain, pasid);
+	mutex_unlock(&master->smmu->asid_lock);
+
+	/*
+	 * When the last user of the CD table goes away downgrade the STE back
+	 * to a non-cd_table one.
+	 */
+	if (last_ssid && !master->cd_table.used_sid) {
+		struct iommu_domain *sid_domain =
+			iommu_get_domain_for_dev(master->dev);
+
+		sid_domain->ops->attach_dev(sid_domain, master->dev);
+	}
+}

-static bool arm_smmu_ats_supported(struct arm_smmu_master *master)
+static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
+				    struct device *dev,
+				    struct arm_smmu_ste *ste,
+				    unsigned int s1dss)
 {
-	struct device *dev = master->dev;
-	struct arm_smmu_device *smmu = master->smmu;
-	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct attach_state state = {.ssid = IOMMU_NO_PASID};

-	if (!(smmu->features & ARM_SMMU_FEAT_ATS))
-		return false;
+	/*
+	 * Do not allow any ASID to be changed while we are working on the STE,
+	 * otherwise we could miss invalidations.
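+	 *
+	 * (Clarifying note) arm_smmu_set_pasid() and the SVA ASID
+	 * re-allocation path run under this same asid_lock, so no CD
+	 * reachable from this STE can change while the STE transition
+	 * is in flight.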
+ */ + mutex_lock(&master->smmu->asid_lock); - if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS)) - return false; + /* + * If the CD table is not in use we can use the provided STE, otherwise + * we use a cdtable STE with the provided S1DSS. + */ + if (arm_smmu_ssids_in_use(&master->cd_table)) { + /* + * If a CD table has to be present then we need to run with ATS + * on even though the RID will fail ATS queries with UR. This is + * because we have no idea what the PASID's need. + */ + arm_smmu_attach_prepare(master, domain, &state); + arm_smmu_make_cdtable_ste(ste, master, state.want_ats, s1dss); + } else { + /* + * The SMMU does not support enabling ATS with bypass/abort. + * When the STE is in bypass (STE.Config[2:0] == 0b100), ATS + * Translation Requests and Translated transactions are denied + * as though ATS is disabled for the stream (STE.EATS == 0b00), + * causing F_BAD_ATS_TREQ and F_TRANSL_FORBIDDEN events + * (IHI0070Ea 5.2 Stream Table Entry). + */ + state.disable_ats = true; + arm_smmu_attach_prepare(master, domain, &state); + } + arm_smmu_install_ste_for_dev(master, ste); + arm_smmu_attach_commit(master, &state); + mutex_unlock(&master->smmu->asid_lock); - return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)); + /* + * This has to be done after removing the master from the + * arm_smmu_domain->devices to avoid races updating the same context + * descriptor from arm_smmu_share_asid(). + */ + arm_smmu_clear_cd(master, IOMMU_NO_PASID); } -static void arm_smmu_enable_ats(struct arm_smmu_master *master) +static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, + struct device *dev) { - size_t stu; - struct pci_dev *pdev; - struct arm_smmu_device *smmu = master->smmu; - struct arm_smmu_domain *smmu_domain = master->domain; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_ste ste; - /* Don't enable ATS at the endpoint if it's not enabled in the STE */ - if (!master->ats_enabled) - return; + arm_smmu_make_bypass_ste(&ste); + arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); + if (dev_is_pci(master->dev) && master->ssid_bits && !master->ats_enabled) { + master->ats_enabled = true; + arm_smmu_enable_ats(master); + } + return 0; +} - /* Smallest Translation Unit: log2 of the smallest supported granule */ - stu = __ffs(smmu->pgsize_bitmap); - pdev = to_pci_dev(master->dev); +static const struct iommu_domain_ops arm_smmu_identity_ops = { + .attach_dev = arm_smmu_attach_dev_identity, +}; - atomic_inc(&smmu_domain->nr_ats_masters); - arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, 0, 0); - if (pci_enable_ats(pdev, stu)) - dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); +static struct iommu_domain arm_smmu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &arm_smmu_identity_ops, +}; + +static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_ste ste; + + arm_smmu_make_abort_ste(&ste); + arm_smmu_attach_dev_ste(domain, dev, &ste, + STRTAB_STE_1_S1DSS_TERMINATE); + return 0; } -static void arm_smmu_disable_ats(struct arm_smmu_master *master) +static const struct iommu_domain_ops arm_smmu_blocked_ops = { + .attach_dev = arm_smmu_attach_dev_blocked, +}; + +static struct iommu_domain arm_smmu_blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &arm_smmu_blocked_ops, +}; + +static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, + struct device *dev) { - struct arm_smmu_domain *smmu_domain = master->domain; + struct 
arm_smmu_nested_domain *nested_domain =
+		container_of(domain, struct arm_smmu_nested_domain, domain);
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct attach_state state = {.ssid = IOMMU_NO_PASID};
+	struct arm_smmu_ste ste;
+	int ret;

-	if (!master->ats_enabled)
-		return;
+	if (arm_smmu_ssids_in_use(&master->cd_table) ||
+	    nested_domain->s2_parent->smmu != master->smmu)
+		return -EINVAL;

-	pci_disable_ats(to_pci_dev(master->dev));
+	mutex_lock(&master->smmu->asid_lock);
 	/*
-	 * Ensure ATS is disabled at the endpoint before we issue the
-	 * ATC invalidation via the SMMU.
+	 * The VM has to control the actual ATS state at the PCI device because
+	 * we forward the invalidations directly from the VM. If the VM doesn't
+	 * think ATS is on it will not generate ATC flushes and the ATC will
+	 * become incoherent. Since we can't access the actual virtual PCI ATS
+	 * config bit here, base this off the EATS value in the STE. If the
+	 * EATS is set then the VM must generate ATC flushes.
 	 */
-	wmb();
-	arm_smmu_atc_inv_master(master);
-	atomic_dec(&smmu_domain->nr_ats_masters);
+	state.disable_ats = !nested_domain->enable_ats;
+	ret = arm_smmu_attach_prepare(master, domain, &state);
+	if (ret) {
+		mutex_unlock(&master->smmu->asid_lock);
+		return ret;
+	}
+	arm_smmu_make_nested_domain_ste(&ste, master, nested_domain,
+					state.want_ats);
+	arm_smmu_install_ste_for_dev(master, &ste);
+	arm_smmu_attach_commit(master, &state);
+	mutex_unlock(&master->smmu->asid_lock);
+	return 0;
 }

-static int arm_smmu_enable_pasid(struct arm_smmu_master *master)
+static void arm_smmu_domain_nested_free(struct iommu_domain *domain)
 {
-	int ret;
-	int features;
-	int num_pasids;
-	struct pci_dev *pdev;
+	struct arm_smmu_nested_domain *nested_domain =
+		container_of(domain, struct arm_smmu_nested_domain, domain);
+	u32 id_user = nested_domain->sid_user;

-	if (!dev_is_pci(master->dev))
-		return -ENODEV;
+	if (!WARN_ON(!id_user)) {
+		struct arm_smmu_device *smmu = nested_domain->s2_parent->smmu;
+		struct arm_smmu_stream *stream;

-	pdev = to_pci_dev(master->dev);
+		xa_lock(&smmu->streams_user);
+		stream = __xa_erase(&smmu->streams_user, id_user);
+		/*
+		 * A mismatched id_user is unlikely to happen, but restore the
+		 * stream back to the xarray for its actual owner to carry on.
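+		 * (Illustrative note) the __xa_erase() and the check
+		 * below must sit under one xa_lock() section: if the
+		 * SID slot was re-claimed in the meantime, the entry is
+		 * put back before any concurrent lookup can see it
+		 * missing.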
+		 */
+		if (unlikely(stream->id_user != id_user)) {
+			WARN_ON(__xa_alloc(&smmu->streams_user, &id_user,
+					   stream, XA_LIMIT(id_user, id_user),
+					   GFP_KERNEL_ACCOUNT));
+		} else
+			stream->id_user = 0;
+		xa_unlock(&smmu->streams_user);
+	}

-	features = pci_pasid_features(pdev);
-	if (features < 0)
-		return features;
+	kfree(nested_domain);
+}

-	num_pasids = pci_max_pasids(pdev);
-	if (num_pasids <= 0)
-		return num_pasids;
+static struct iommu_domain *
+arm_smmu_get_msi_mapping_domain(struct iommu_domain *domain)
+{
+	struct arm_smmu_nested_domain *nested_domain =
+		container_of(domain, struct arm_smmu_nested_domain, domain);

-	ret = pci_enable_pasid(pdev, features);
-	if (ret) {
-		dev_err(&pdev->dev, "Failed to enable PASID\n");
-		return ret;
+	return &nested_domain->s2_parent->domain;
+}
+
+static int arm_smmu_fix_user_cmd(struct arm_smmu_nested_domain *nested_domain,
+				 u64 *cmd)
+{
+	struct arm_smmu_device *smmu = nested_domain->s2_parent->smmu;
+	struct arm_smmu_stream *stream;
+
+	cmd[0] = le64_to_cpu(cmd[0]);
+	cmd[1] = le64_to_cpu(cmd[1]);
+
+	switch (*cmd & CMDQ_0_OP) {
+	case CMDQ_OP_TLBI_NSNH_ALL:
+		*cmd &= ~CMDQ_0_OP;
+		*cmd |= CMDQ_OP_TLBI_NH_ALL;
+		fallthrough;
+	case CMDQ_OP_TLBI_NH_VA:
+	case CMDQ_OP_TLBI_NH_VAA:
+	case CMDQ_OP_TLBI_NH_ALL:
+	case CMDQ_OP_TLBI_NH_ASID:
+		*cmd &= ~CMDQ_TLBI_0_VMID;
+		*cmd |= FIELD_PREP(CMDQ_TLBI_0_VMID,
+				   nested_domain->s2_parent->vmid);
+		break;
+	case CMDQ_OP_ATC_INV:
+	case CMDQ_OP_CFGI_CD:
+	case CMDQ_OP_CFGI_CD_ALL:
+		xa_lock(&smmu->streams_user);
+		stream = xa_load(&smmu->streams_user,
+				 FIELD_GET(CMDQ_CFGI_0_SID, *cmd));
+		xa_unlock(&smmu->streams_user);
+		if (!stream)
+			return -ENODEV;
+		*cmd &= ~CMDQ_CFGI_0_SID;
+		*cmd |= FIELD_PREP(CMDQ_CFGI_0_SID, stream->id);
+		break;
+	default:
+		return -EINVAL;
+	}
+	pr_debug("Fixed user CMD: %016llx : %016llx\n", cmd[1], cmd[0]);

-	master->ssid_bits = min_t(u8, ilog2(num_pasids),
-				  master->smmu->ssid_bits);
 	return 0;
 }

-static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
+/*
+ * A helper function to detect if the current cmd hits an Arm erratum when it
+ * is added to a batch that might already contain a leaf TLBI or a CFGI
+ * command.
+ *
+ * If there is a hit, it updates the corresponding boolean flag, on the
+ * assumption that the caller will issue the batch without this cmd. The two
+ * flags then reflect the status of the new batch that contains only the
+ * current cmd.
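+ *
+ * Worked example (illustrative): given a batch {CFGI_CD, TLBI_NH_VA},
+ * the TLBI trips erratum 2812531, so the caller issues {CFGI_CD} with a
+ * trailing CMD_SYNC first and then starts a new batch holding only the
+ * TLBI; has_cfgi is cleared so both flags describe that new batch.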
+ */ +static bool arm_smmu_cmdq_hit_errata(struct arm_smmu_device *smmu, u64 *cmd, + bool *batch_has_leaf, bool *batch_has_cfgi) { - struct pci_dev *pdev; + bool cmd_has_leaf; - if (!dev_is_pci(master->dev)) - return; - - pdev = to_pci_dev(master->dev); + if (!(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) + return false; - if (!pdev->pasid_enabled) - return; + switch (*cmd & CMDQ_0_OP) { + case CMDQ_OP_CFGI_CD: + case CMDQ_OP_CFGI_CD_ALL: + *batch_has_cfgi = true; + break; + case CMDQ_OP_TLBI_NH_VA: + case CMDQ_OP_TLBI_NH_VAA: + cmd_has_leaf = FIELD_GET(CMDQ_TLBI_1_LEAF, cmd[1]); + /* Erratum 2812531 -- a non-leaf tlbi following a leaf tlbi */ + if (!cmd_has_leaf && *batch_has_leaf) { + *batch_has_leaf = false; + return true; + } + if (cmd_has_leaf) + *batch_has_leaf = true; + fallthrough; + case CMDQ_OP_TLBI_NH_ALL: + case CMDQ_OP_TLBI_NH_ASID: + /* Erratum 2812531 -- a tlbi following a cfgi */ + if (*batch_has_cfgi) { + *batch_has_cfgi = false; + return true; + } + break; + } - master->ssid_bits = 0; - pci_disable_pasid(pdev); + return false; } -static void arm_smmu_detach_dev(struct arm_smmu_master *master) +static int arm_smmu_cache_invalidate_user(struct iommu_domain *domain, + struct iommu_user_data_array *array) { - unsigned long flags; - struct arm_smmu_domain *smmu_domain = master->domain; + struct arm_smmu_nested_domain *nested_domain = + container_of(domain, struct arm_smmu_nested_domain, domain); + struct arm_smmu_device *smmu = nested_domain->s2_parent->smmu; + bool has_leaf = false, has_cfgi = false; + int data_idx, n = 0, ret; + u64 *cmds; - if (!smmu_domain) - return; - - arm_smmu_disable_ats(master); + if (!smmu) + return -EINVAL; + if (!array->entry_num) + return -EINVAL; - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_del(&master->domain_head); - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + cmds = kcalloc(array->entry_num, sizeof(*cmds) * 2, GFP_KERNEL); + if (!cmds) + return -ENOMEM; - master->domain = NULL; - master->ats_enabled = false; - arm_smmu_install_ste_for_dev(master); - /* - * Clearing the CD entry isn't strictly required to detach the domain - * since the table is uninstalled anyway, but it helps avoid confusion - * in the call to arm_smmu_write_ctx_desc on the next attach (which - * expects the entry to be empty). 
- */ - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && master->cd_table.cdtab) - arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); -} + for (data_idx = 0; data_idx < array->entry_num; data_idx++) { + struct iommu_hwpt_arm_smmuv3_invalidate *inv = + (struct iommu_hwpt_arm_smmuv3_invalidate *)&cmds[n * 2]; -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) -{ - int ret = 0; - unsigned long flags; - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct arm_smmu_device *smmu; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_master *master; + ret = iommu_copy_struct_from_user_array(inv, array, + IOMMU_HWPT_DATA_ARM_SMMUV3, + data_idx, cmd); + if (ret) + goto out; - if (!fwspec) - return -ENOENT; + ret = arm_smmu_fix_user_cmd(nested_domain, inv->cmd); + if (ret) + goto out; - master = dev_iommu_priv_get(dev); - smmu = master->smmu; + if (arm_smmu_cmdq_hit_errata(smmu, inv->cmd, &has_leaf, &has_cfgi)) { + /* WAR is to issue the batch prior, with a CMD_SYNC */ + ret = arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, true); + if (ret) + goto out; + /* Then move the cmd to the head for a new batch */ + cmds[1] = cmds[n * 2 + 1]; + cmds[0] = cmds[n * 2]; + n = 1; + continue; + } - /* - * Checking that SVA is disabled ensures that this device isn't bound to - * any mm, and can be safely detached from its old domain. Bonds cannot - * be removed concurrently since we're holding the group mutex. - */ - if (arm_smmu_master_sva_enabled(master)) { - dev_err(dev, "cannot attach - SVA enabled\n"); - return -EBUSY; + if (++n == CMDQ_BATCH_ENTRIES - 1) { + ret = arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, true); + if (ret) + goto out; + n = 0; + } } - arm_smmu_detach_dev(master); + if (n) + ret = arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, true); +out: + array->entry_num = data_idx; + kfree(cmds); + return ret; +} + +static const struct iommu_domain_ops arm_smmu_nested_ops = { + .attach_dev = arm_smmu_attach_dev_nested, + .free = arm_smmu_domain_nested_free, + .get_msi_mapping_domain = arm_smmu_get_msi_mapping_domain, + .cache_invalidate_user = arm_smmu_cache_invalidate_user, +}; - mutex_lock(&smmu_domain->init_mutex); +static struct iommu_domain * +arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_nested_domain *nested_domain; + struct arm_smmu_domain *smmu_parent; + struct iommu_hwpt_arm_smmuv3 arg; + unsigned int eats; + int ret; - if (!smmu_domain->smmu) { - smmu_domain->smmu = smmu; - ret = arm_smmu_domain_finalise(domain); - if (ret) - smmu_domain->smmu = NULL; - } else if (smmu_domain->smmu != smmu) - ret = -EINVAL; + if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) + return ERR_PTR(-EOPNOTSUPP); - mutex_unlock(&smmu_domain->init_mutex); + if (master->streams[0].id_user) + return ERR_PTR(-EEXIST); + + ret = iommu_copy_struct_from_user(&arg, user_data, + IOMMU_HWPT_DATA_ARM_SMMUV3, ste); if (ret) - return ret; + return ERR_PTR(ret); - master->domain = smmu_domain; + if (flags || !(master->smmu->features & ARM_SMMU_FEAT_TRANS_S1)) + return ERR_PTR(-EOPNOTSUPP); - /* - * The SMMU does not support enabling ATS with bypass. 
When the STE is - * in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests and - * Translated transactions are denied as though ATS is disabled for the - * stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and - * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry). - */ - if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS) - master->ats_enabled = arm_smmu_ats_supported(master); + if (!(parent->type & __IOMMU_DOMAIN_PAGING)) + return ERR_PTR(-EINVAL); - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_add(&master->domain_head, &smmu_domain->devices); - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + smmu_parent = to_smmu_domain(parent); + if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 || + smmu_parent->smmu != master->smmu) + return ERR_PTR(-EINVAL); - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { - if (!master->cd_table.cdtab) { - ret = arm_smmu_alloc_cd_tables(master); - if (ret) { - master->domain = NULL; - goto out_list_del; - } - } + /* EIO is reserved for invalid STE data. */ + if ((arg.ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) || + (arg.ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED)) + return ERR_PTR(-EIO); - /* - * Prevent SVA from concurrently modifying the CD or writing to - * the CD entry - */ - mutex_lock(&arm_smmu_asid_lock); - ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, &smmu_domain->cd); - mutex_unlock(&arm_smmu_asid_lock); - if (ret) { - master->domain = NULL; - goto out_list_del; - } + /* Only Full ATS or ATS UR is supported */ + eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg.ste[1])); + if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS) + return ERR_PTR(-EIO); + + nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT); + if (!nested_domain) + return ERR_PTR(-ENOMEM); + + ret = xa_alloc(&smmu_parent->smmu->streams_user, &arg.sid, + &master->streams[0], XA_LIMIT(arg.sid, arg.sid), + GFP_KERNEL_ACCOUNT); + if (ret) { + kfree(nested_domain); + return ERR_PTR(ret); } + master->streams[0].id_user = arg.sid; - arm_smmu_install_ste_for_dev(master); + nested_domain->domain.type = IOMMU_DOMAIN_NESTED; + nested_domain->domain.ops = &arm_smmu_nested_ops; + nested_domain->s2_parent = smmu_parent; + nested_domain->enable_ats = eats == STRTAB_STE_1_EATS_TRANS; + nested_domain->ste[0] = arg.ste[0]; + nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS); + nested_domain->sid_user = arg.sid; - arm_smmu_enable_ats(master); - return 0; + return &nested_domain->domain; +} -out_list_del: - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_del(&master->domain_head); - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); +static struct iommu_domain * +arm_smmu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_NEST_PARENT; + struct arm_smmu_domain *smmu_domain; + int ret; - return ret; + if (parent) + return arm_smmu_domain_alloc_nesting(dev, flags, parent, + user_data); + + if (flags & ~PAGING_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + if (user_data) + return ERR_PTR(-EINVAL); + + smmu_domain = arm_smmu_domain_alloc(); + if (!smmu_domain) + return ERR_PTR(-ENOMEM); + + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) { + if (!(master->smmu->features & ARM_SMMU_FEAT_TRANS_S2)) { + ret = -EOPNOTSUPP; + goto err_free; + } + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + smmu_domain->nesting_parent = true; + } + + 
smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; + smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; + ret = arm_smmu_domain_finalise(smmu_domain, master->smmu); + if (ret) + goto err_free; + return &smmu_domain->domain; + +err_free: + kfree(smmu_domain); + return ERR_PTR(ret); } static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, @@ -2716,13 +3684,24 @@ static void arm_smmu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + size_t size = gather->end - gather->start + 1; if (!gather->pgsize) return; - arm_smmu_tlb_inv_range_domain(gather->start, - gather->end - gather->start + 1, - gather->pgsize, true, smmu_domain); + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) + arm_smmu_tlb_inv_range_s1(smmu_domain, gather->start, size, + gather->pgsize, true); + + else + arm_smmu_tlb_inv_range_s2(smmu_domain, gather->start, size, + gather->pgsize, true); + + /* + * Unfortunately, this can't be leaf-only since we may have + * zapped an entire table. + */ + arm_smmu_atc_inv_domain(smmu_domain, gather->start, size); } #ifdef CONFIG_HISILICON_ERRATUM_162100602 @@ -2738,7 +3717,10 @@ static int arm_smmu_iotlb_sync_map(struct iommu_domain *domain, granule_size = 1 << __ffs(smmu_domain->domain.pgsize_bitmap); /* Add a SYNC command to sync io-pgtale to avoid errors in pgtable prefetch*/ - arm_smmu_tlb_inv_range_domain(iova, granule_size, granule_size, true, smmu_domain); + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) + arm_smmu_tlb_inv_range_s1(smmu_domain, iova, granule_size, granule_size, true); + else + arm_smmu_tlb_inv_range_s2(smmu_domain, iova, granule_size, granule_size, true); return 0; } @@ -2794,8 +3776,6 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu, { int i; int ret = 0; - struct arm_smmu_stream *new_stream, *cur_stream; - struct rb_node **new_node, *parent_node = NULL; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); master->streams = kcalloc(fwspec->num_ids, sizeof(*master->streams), @@ -2806,9 +3786,9 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu, mutex_lock(&smmu->streams_mutex); for (i = 0; i < fwspec->num_ids; i++) { + struct arm_smmu_stream *new_stream = &master->streams[i]; u32 sid = fwspec->ids[i]; - new_stream = &master->streams[i]; new_stream->id = sid; new_stream->master = master; @@ -2817,28 +3797,13 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu, break; /* Insert into SID tree */ - new_node = &(smmu->streams.rb_node); - while (*new_node) { - cur_stream = rb_entry(*new_node, struct arm_smmu_stream, - node); - parent_node = *new_node; - if (cur_stream->id > new_stream->id) { - new_node = &((*new_node)->rb_left); - } else if (cur_stream->id < new_stream->id) { - new_node = &((*new_node)->rb_right); - } else { - dev_warn(master->dev, - "stream %u already in tree\n", - cur_stream->id); - ret = -EINVAL; - break; - } - } - if (ret) + if (rb_find_add(&new_stream->node, &smmu->streams, + arm_smmu_streams_cmp_node)) { + dev_warn(master->dev, "stream %u already in tree\n", + sid); + ret = -EINVAL; break; - - rb_link_node(&new_stream->node, parent_node, new_node); - rb_insert_color(&new_stream->node, &smmu->streams); + } } if (ret) { @@ -2868,8 +3833,6 @@ static void arm_smmu_remove_master(struct arm_smmu_master *master) kfree(master->streams); } -static struct iommu_ops arm_smmu_ops; - static struct iommu_device *arm_smmu_probe_device(struct device *dev) { int ret; @@ -2890,7 +3853,6 @@ static 
struct iommu_device *arm_smmu_probe_device(struct device *dev) master->dev = dev; master->smmu = smmu; - INIT_LIST_HEAD(&master->bonds); dev_iommu_priv_set(dev, master); ret = arm_smmu_insert_master(smmu, master); @@ -2932,10 +3894,16 @@ static void arm_smmu_release_device(struct device *dev) if (WARN_ON(arm_smmu_master_sva_enabled(master))) iopf_queue_remove_device(master->smmu->evtq.iopf, dev); - arm_smmu_detach_dev(master); + + /* Put the STE back to what arm_smmu_init_strtab() sets */ + if (disable_bypass && !dev->iommu->require_direct) + arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); + else + arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev); + arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); - if (master->cd_table.cdtab) + if (arm_smmu_cdtab_allocated(&master->cd_table)) arm_smmu_free_cd_tables(master); kfree(master); } @@ -2957,21 +3925,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - #ifdef CONFIG_ARM_SMMU_V3_HTTU static int arm_smmu_split_block(struct iommu_domain *domain, unsigned long iova, size_t size) @@ -3273,20 +4226,14 @@ static int arm_smmu_def_domain_type(struct device *dev) return ret; } -static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) -{ - struct iommu_domain *domain; - - domain = iommu_get_domain_for_dev_pasid(dev, pasid, IOMMU_DOMAIN_SVA); - if (WARN_ON(IS_ERR(domain)) || !domain) - return; - - arm_smmu_sva_remove_dev_pasid(domain, dev, pasid); -} - static struct iommu_ops arm_smmu_ops = { + .identity_domain = &arm_smmu_identity_domain, + .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, - .domain_alloc = arm_smmu_domain_alloc, + .hw_info = arm_smmu_hw_info, + .domain_alloc_paging = arm_smmu_domain_alloc_paging, + .domain_alloc_user = arm_smmu_domain_alloc_user, + .domain_alloc_sva = arm_smmu_sva_domain_alloc, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .device_group = arm_smmu_device_group, @@ -3301,6 +4248,7 @@ static struct iommu_ops arm_smmu_ops = { .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = arm_smmu_attach_dev, + .set_dev_pasid = arm_smmu_s1_set_dev_pasid, .map_pages = arm_smmu_map_pages, .unmap_pages = arm_smmu_unmap_pages, .flush_iotlb_all = arm_smmu_flush_iotlb_all, @@ -3309,7 +4257,6 @@ static struct iommu_ops arm_smmu_ops = { .iotlb_sync_map = arm_smmu_iotlb_sync_map, #endif .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, #ifdef CONFIG_ARM_SMMU_V3_HTTU .support_dirty_log = arm_smmu_support_dirty_log, .switch_dirty_log = arm_smmu_switch_dirty_log, @@ -3441,17 +4388,15 @@ static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu) { unsigned int i; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; - void *strtab = smmu->strtab_cfg.strtab; cfg->l1_desc = devm_kcalloc(smmu->dev, cfg->num_l1_ents, sizeof(*cfg->l1_desc), GFP_KERNEL); if (!cfg->l1_desc) return -ENOMEM; - for (i = 0; i < cfg->num_l1_ents; ++i) { - arm_smmu_write_strtab_l1_desc(strtab, &cfg->l1_desc[i]); - strtab += STRTAB_L1_DESC_DWORDS << 3; - } + for (i = 0; i < cfg->num_l1_ents; ++i) + 
arm_smmu_write_strtab_l1_desc( + &smmu->strtab_cfg.strtab.l1_desc[i], &cfg->l1_desc[i]); return 0; } @@ -3527,7 +4472,7 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) l1size); return -ENOMEM; } - cfg->strtab = strtab; + cfg->strtab.l1_desc = strtab; /* Configure strtab_base_cfg for 2 levels */ reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_2LVL); @@ -3551,7 +4496,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) u32 size; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; - size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3); + size = (1 << smmu->sid_bits) * sizeof(cfg->strtab.linear[0]); strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma, GFP_KERNEL); if (!strtab) { @@ -3560,7 +4505,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) size); return -ENOMEM; } - cfg->strtab = strtab; + cfg->strtab.linear = strtab; cfg->num_l1_ents = 1 << smmu->sid_bits; /* Configure strtab_base_cfg for a linear table covering all SIDs */ @@ -3568,7 +4513,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); cfg->strtab_base_cfg = reg; - arm_smmu_init_bypass_stes(strtab, cfg->num_l1_ents, false); + arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents); return 0; } @@ -3591,6 +4536,8 @@ static int arm_smmu_init_strtab(struct arm_smmu_device *smmu) smmu->strtab_cfg.strtab_base = reg; ida_init(&smmu->vmid_map); + xa_init_flags(&smmu->asid_map, XA_FLAGS_ALLOC1); + mutex_init(&smmu->asid_lock); return 0; } @@ -3601,6 +4548,7 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu) mutex_init(&smmu->streams_mutex); smmu->streams = RB_ROOT; + xa_init_flags(&smmu->streams_user, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); ret = arm_smmu_init_queues(smmu); if (ret) @@ -4658,7 +5606,6 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) iort_get_rmr_sids(dev_fwnode(smmu->dev), &rmr_list); list_for_each_entry(e, &rmr_list, list) { - struct arm_smmu_ste *step; struct iommu_iort_rmr_data *rmr; int ret, i; @@ -4671,8 +5618,12 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) continue; } - step = arm_smmu_get_step_for_sid(smmu, rmr->sids[i]); - arm_smmu_init_bypass_stes(step, 1, true); + /* + * STE table is not programmed to HW, see + * arm_smmu_initial_bypass_stes() + */ + arm_smmu_make_bypass_ste( + arm_smmu_get_step_for_sid(smmu, rmr->sids[i])); } } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 859d2172988251042645b9cd107c141e0e7e7e47..c68779303b0dd33b56c94e1e54959242dbfa1f9b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -84,6 +84,8 @@ #define IIDR_REVISION GENMASK(15, 12) #define IIDR_IMPLEMENTER GENMASK(11, 0) +#define ARM_SMMU_AIDR 0x1C + #define ARM_SMMU_CR0 0x20 #define CR0_ATSCHK (1 << 4) #define CR0_CMDQEN (1 << 3) @@ -235,10 +237,8 @@ #define STRTAB_L1_DESC_SPAN GENMASK_ULL(4, 0) #define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6) -#define STRTAB_STE_DWORDS 8 - struct arm_smmu_ste { - __le64 data[STRTAB_STE_DWORDS]; + __le64 data[8]; }; #define STRTAB_STE_0_V (1UL << 0) @@ -247,6 +247,7 @@ struct arm_smmu_ste { #define STRTAB_STE_0_CFG_BYPASS 4 #define STRTAB_STE_0_CFG_S1_TRANS 5 #define STRTAB_STE_0_CFG_S2_TRANS 6 +#define STRTAB_STE_0_CFG_NESTED 7 #define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4) #define STRTAB_STE_0_S1FMT_LINEAR 0 @@ -297,6 +298,15 @@ struct 
arm_smmu_ste { #define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) +/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */ +#define STRTAB_STE_0_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_S1FMT | \ + STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX) +#define STRTAB_STE_1_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \ + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \ + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS) + /* * Context descriptors. * @@ -311,7 +321,10 @@ struct arm_smmu_ste { #define CTXDESC_L1_DESC_V (1UL << 0) #define CTXDESC_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 12) -#define CTXDESC_CD_DWORDS 8 +struct arm_smmu_cd { + __le64 data[8]; +}; + #define CTXDESC_CD_0_TCR_T0SZ GENMASK_ULL(5, 0) #define CTXDESC_CD_0_TCR_TG0 GENMASK_ULL(7, 6) #define CTXDESC_CD_0_TCR_IRGN0 GENMASK_ULL(9, 8) @@ -342,7 +355,7 @@ struct arm_smmu_ste { * When the SMMU only supports linear context descriptor tables, pick a * reasonable size limit (64kB). */ -#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / (CTXDESC_CD_DWORDS << 3)) +#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / sizeof(struct arm_smmu_cd)) /* Command queue */ #define CMDQ_ENT_SZ_SHIFT 4 @@ -495,8 +508,10 @@ struct arm_smmu_cmdq_ent { }; } cfgi; + #define CMDQ_OP_TLBI_NH_ALL 0x10 #define CMDQ_OP_TLBI_NH_ASID 0x11 #define CMDQ_OP_TLBI_NH_VA 0x12 + #define CMDQ_OP_TLBI_NH_VAA 0x13 #define CMDQ_OP_TLBI_EL2_ALL 0x20 #define CMDQ_OP_TLBI_EL2_ASID 0x21 #define CMDQ_OP_TLBI_EL2_VA 0x22 @@ -627,41 +642,46 @@ struct arm_smmu_strtab_l1_desc { dma_addr_t l2ptr_dma; }; -struct arm_smmu_ctx_desc { - u16 asid; - u64 ttbr; - u64 tcr; - u64 mair; - - refcount_t refs; - struct mm_struct *mm; -}; - struct arm_smmu_l1_ctx_desc { - __le64 *l2ptr; + struct arm_smmu_cd *l2ptr; dma_addr_t l2ptr_dma; }; struct arm_smmu_ctx_desc_cfg { - __le64 *cdtab; + union { + struct arm_smmu_cd *linear; + __le64 *l1_desc; + } cdtab; dma_addr_t cdtab_dma; struct arm_smmu_l1_ctx_desc *l1_desc; unsigned int num_l1_ents; + unsigned int used_ssids; + u8 used_sid; + u8 in_ste; u8 s1fmt; /* log2 of the maximum number of CDs supported by this table */ u8 s1cdmax; - /* Whether CD entries in this table have the stall bit set. */ - u8 stall_enabled:1; }; -struct arm_smmu_s2_cfg { - u16 vmid; - u64 vttbr; - u64 vtcr; -}; +static inline bool +arm_smmu_cdtab_allocated(struct arm_smmu_ctx_desc_cfg *cd_table) +{ + return cd_table->cdtab.linear; +} + +/* True if the cd table has SSIDS > 0 in use. 
*/ +static inline bool arm_smmu_ssids_in_use(struct arm_smmu_ctx_desc_cfg *cd_table) +{ + if (cd_table->used_sid) + return cd_table->used_ssids > 1; + return cd_table->used_ssids; +} struct arm_smmu_strtab_cfg { - __le64 *strtab; + union { + struct arm_smmu_ste *linear; + __le64 *l1_desc; + } strtab; dma_addr_t strtab_dma; struct arm_smmu_strtab_l1_desc *l1_desc; unsigned int num_l1_ents; @@ -733,6 +753,8 @@ struct arm_smmu_device { #define ARM_SMMU_MAX_ASIDS (1 << 16) unsigned int asid_bits; + struct xarray asid_map; + struct mutex asid_lock; #define ARM_SMMU_MAX_VMIDS (1 << 16) unsigned int vmid_bits; @@ -750,10 +772,12 @@ struct arm_smmu_device { struct mutex streams_mutex; bool bypass; + struct xarray streams_user; }; struct arm_smmu_stream { u32 id; + u32 id_user; struct arm_smmu_master *master; struct rb_node node; }; @@ -762,17 +786,15 @@ struct arm_smmu_stream { struct arm_smmu_master { struct arm_smmu_device *smmu; struct device *dev; - struct arm_smmu_domain *domain; - struct list_head domain_head; struct arm_smmu_stream *streams; /* Locked by the iommu core using the group mutex */ struct arm_smmu_ctx_desc_cfg cd_table; unsigned int num_streams; - bool ats_enabled; + bool ats_enabled : 1; + bool ste_ats_enabled : 1; bool stall_enabled; bool sva_enabled; bool iopf_enabled; - struct list_head bonds; unsigned int ssid_bits; }; @@ -780,7 +802,6 @@ struct arm_smmu_master { enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_S1 = 0, ARM_SMMU_DOMAIN_S2, - ARM_SMMU_DOMAIN_BYPASS, }; struct arm_smmu_domain { @@ -792,35 +813,99 @@ struct arm_smmu_domain { enum arm_smmu_domain_stage stage; union { - struct arm_smmu_ctx_desc cd; - struct arm_smmu_s2_cfg s2_cfg; + u32 asid; + u16 vmid; }; struct iommu_domain domain; + /* List of struct arm_smmu_master_domain */ struct list_head devices; spinlock_t devices_lock; - struct list_head mmu_notifiers; + struct mmu_notifier mmu_notifier; + bool btm_invalidation : 1; + bool nesting_parent : 1; }; +struct arm_smmu_nested_domain { + struct iommu_domain domain; + struct arm_smmu_domain *s2_parent; + u8 enable_ats : 1; + + __le64 ste[2]; + u32 sid_user; +}; + +struct arm_smmu_master_domain { + struct list_head devices_elm; + struct arm_smmu_master *master; + u16 ssid; + u8 nested_parent; +}; + +/* The following are exposed for testing purposes. 
*/ +struct arm_smmu_entry_writer_ops; +struct arm_smmu_entry_writer { + const struct arm_smmu_entry_writer_ops *ops; + struct arm_smmu_master *master; +}; + +struct arm_smmu_entry_writer_ops { + unsigned int num_entry_qwords; + __le64 v_bit; + void (*get_used)(const __le64 *entry, __le64 *used); + void (*sync)(struct arm_smmu_entry_writer *writer); +}; + +void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits); +void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur, + const __le64 *target); + +void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); +void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target); +void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, bool ats_enabled, + unsigned int s1dss); +void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, + bool ats_enabled); + static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) { return container_of(dom, struct arm_smmu_domain, domain); } -extern struct xarray arm_smmu_asid_xa; -extern struct mutex arm_smmu_asid_lock; -extern struct arm_smmu_ctx_desc quiet_cd; - -int arm_smmu_write_ctx_desc(struct arm_smmu_master *smmu_master, int ssid, - struct arm_smmu_ctx_desc *cd); -void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); -void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, - size_t granule, bool leaf, - struct arm_smmu_domain *smmu_domain); -bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd); -int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, +struct arm_smmu_domain *arm_smmu_domain_alloc(void); + +void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid); +struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, + u32 ssid); +void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain); +void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, + struct arm_smmu_cd *cdptr, + const struct arm_smmu_cd *target); + +int arm_smmu_set_pasid(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, ioasid_t pasid, + struct arm_smmu_cd *cd); +void arm_smmu_remove_pasid(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, ioasid_t pasid); + +int arm_smmu_domain_alloc_id(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain); +void arm_smmu_domain_free_id(struct arm_smmu_domain *smmu_domain); +void arm_smmu_tlb_inv_range_s1(struct arm_smmu_domain *smmu_domain, + unsigned long iova, size_t size, size_t granule, + bool leaf); +static inline void arm_smmu_tlb_inv_all_s1(struct arm_smmu_domain *smmu_domain) +{ + arm_smmu_tlb_inv_range_s1(smmu_domain, 0, 0, PAGE_SIZE, false); +} +int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, unsigned long iova, size_t size); #ifdef CONFIG_ARM_SMMU_V3_SVA @@ -831,9 +916,8 @@ int arm_smmu_master_enable_sva(struct arm_smmu_master *master); int arm_smmu_master_disable_sva(struct arm_smmu_master *master); bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master); void arm_smmu_sva_notifier_synchronize(void); -struct iommu_domain *arm_smmu_sva_domain_alloc(void); -void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id); +struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); #else /* CONFIG_ARM_SMMU_V3_SVA */ static inline 
bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { @@ -877,5 +961,8 @@ static inline void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, ioasid_t id) { } + +#define arm_smmu_sva_domain_alloc NULL + #endif /* CONFIG_ARM_SMMU_V3_SVA */ #endif /* _ARM_SMMU_V3_H */ diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 2b57ae0e2166c8fd0de7ad34a92203f7fdc744f1..29a68dacd56b5c1451c3b50e2cabbea9ff2deab3 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1538,21 +1538,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { @@ -1635,7 +1620,6 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .set_pgtable_quirks = arm_smmu_set_pgtable_quirks, .free = arm_smmu_domain_free, } diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 037fcf826407f4868517ab8841e47e816652a762..f6d904cdcf97bdec518ec733a344c45394822b19 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1796,6 +1796,20 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return NULL; } +/* + * Nested domains may not have an MSI cookie or accept mappings, but they may + * be related to a domain which does, so we let them tell us what they need. 
+ */ +static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev) +{ + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + + if (domain && domain->type == IOMMU_DOMAIN_NESTED && + domain->ops->get_msi_mapping_domain) + domain = domain->ops->get_msi_mapping_domain(domain); + return domain; +} + /** * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain * @desc: MSI descriptor, will store the MSI page @@ -1806,7 +1820,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev); struct iommu_dma_msi_page *msi_page; static DEFINE_MUTEX(msi_prepare_lock); /* see below */ @@ -1839,7 +1853,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) { struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + const struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev); const struct iommu_dma_msi_page *msi_page; msi_page = msi_desc_get_iommu_cookie(desc); diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 06d78fcc79fdb6818af10d37b3a23f8cb0ae6b31..9a8074d8abf034af01f7ad2bffa60d71058fd6b9 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -39,6 +39,103 @@ static void iopf_put_dev_fault_param(struct iommu_fault_param *fault_param) kfree_rcu(fault_param, rcu); } +/* Get the domain attachment cookie for pasid of a device. */ +static struct iopf_attach_cookie * +iopf_pasid_cookie_get(struct device *dev, ioasid_t pasid) +{ + struct iommu_fault_param *iopf_param = iopf_get_dev_fault_param(dev); + struct iopf_attach_cookie *curr; + + if (!iopf_param) + return ERR_PTR(-ENODEV); + + xa_lock(&iopf_param->pasid_cookie); + curr = xa_load(&iopf_param->pasid_cookie, pasid); + if (curr && !refcount_inc_not_zero(&curr->users)) + curr = ERR_PTR(-EINVAL); + xa_unlock(&iopf_param->pasid_cookie); + + iopf_put_dev_fault_param(iopf_param); + + return curr; +} + +/* Put the domain attachment cookie. */ +static void iopf_pasid_cookie_put(struct iopf_attach_cookie *cookie) +{ + if (cookie && refcount_dec_and_test(&cookie->users)) + cookie->release(cookie); +} + +/* + * Set the domain attachment cookie for pasid of a device. Return 0 on + * success, or error number on failure. + */ +static int iopf_pasid_cookie_set(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid, struct iopf_attach_cookie *cookie) +{ + struct iommu_fault_param *iopf_param = iopf_get_dev_fault_param(dev); + struct iopf_attach_cookie *curr; + + if (!iopf_param) + return -ENODEV; + + refcount_set(&cookie->users, 1); + cookie->dev = dev; + cookie->pasid = pasid; + cookie->domain = domain; + + curr = xa_cmpxchg(&iopf_param->pasid_cookie, pasid, NULL, cookie, GFP_KERNEL); + iopf_put_dev_fault_param(iopf_param); + + return curr ? xa_err(curr) : 0; +} + +/* Clear the domain attachment cookie for pasid of a device. 
*/ +static void iopf_pasid_cookie_clear(struct device *dev, ioasid_t pasid) +{ + struct iommu_fault_param *iopf_param = iopf_get_dev_fault_param(dev); + struct iopf_attach_cookie *curr; + + if (WARN_ON(!iopf_param)) + return; + + curr = xa_erase(&iopf_param->pasid_cookie, pasid); + /* paired with iopf_pasid_cookie_set/replace() */ + iopf_pasid_cookie_put(curr); + + iopf_put_dev_fault_param(iopf_param); +} + +/* Replace the domain attachment cookie for pasid of a device. */ +static int iopf_pasid_cookie_replace(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid, struct iopf_attach_cookie *cookie) +{ + struct iommu_fault_param *iopf_param = iopf_get_dev_fault_param(dev); + struct iopf_attach_cookie *curr; + + if (!iopf_param) + return -ENODEV; + + if (cookie) { + refcount_set(&cookie->users, 1); + cookie->dev = dev; + cookie->pasid = pasid; + cookie->domain = domain; + } + + curr = xa_store(&iopf_param->pasid_cookie, pasid, cookie, GFP_KERNEL); + if (xa_err(curr)) + return xa_err(curr); + + /* paired with iopf_pasid_cookie_set/replace() */ + iopf_pasid_cookie_put(curr); + + iopf_put_dev_fault_param(iopf_param); + + return 0; +} + static void __iopf_free_group(struct iopf_group *group) { struct iopf_fault *iopf, *next; @@ -50,6 +147,7 @@ static void __iopf_free_group(struct iopf_group *group) /* Pair with iommu_report_device_fault(). */ iopf_put_dev_fault_param(group->fault_param); + iopf_pasid_cookie_put(group->cookie); } void iopf_free_group(struct iopf_group *group) @@ -59,30 +157,6 @@ void iopf_free_group(struct iopf_group *group) } EXPORT_SYMBOL_GPL(iopf_free_group); -static struct iommu_domain *get_domain_for_iopf(struct device *dev, - struct iommu_fault *fault) -{ - struct iommu_domain *domain; - - if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { - domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0); - if (IS_ERR(domain)) - domain = NULL; - } else { - domain = iommu_get_domain_for_dev(dev); - } - - if (!domain || !domain->iopf_handler) { - dev_warn_ratelimited(dev, - "iopf (pasid %d) without domain attached or handler installed\n", - fault->prm.pasid); - - return NULL; - } - - return domain; -} - /* Non-last request of a group. Postpone until the last one. */ static int report_partial_fault(struct iommu_fault_param *fault_param, struct iommu_fault *fault) @@ -102,10 +176,20 @@ static int report_partial_fault(struct iommu_fault_param *fault_param, return 0; } +static ioasid_t fault_to_pasid(struct iommu_fault *fault) +{ + if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) + return fault->prm.pasid; + + return IOMMU_NO_PASID; +} + static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, struct iopf_fault *evt, struct iopf_group *abort_group) { + ioasid_t pasid = fault_to_pasid(&evt->fault); + struct iopf_attach_cookie *cookie; struct iopf_fault *iopf, *next; struct iopf_group *group; @@ -118,7 +202,23 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, group = abort_group; } + cookie = iopf_pasid_cookie_get(iopf_param->dev, pasid); + if (!cookie && pasid != IOMMU_NO_PASID) + cookie = iopf_pasid_cookie_get(iopf_param->dev, IOMMU_NO_PASID); + if (IS_ERR(cookie) || !cookie) { + /* + * The PASID of this device was not attached by an I/O-capable + * domain. Ask the caller to abort handling of this fault. + * Otherwise, the reference count will be switched to the new + * iopf group and will be released in iopf_free_group(). 
+ */ + kfree(group); + group = abort_group; + cookie = NULL; + } + group->fault_param = iopf_param; + group->cookie = cookie; group->last_fault.fault = evt->fault; INIT_LIST_HEAD(&group->faults); INIT_LIST_HEAD(&group->pending_node); @@ -206,15 +306,11 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) if (group == &abort_group) goto err_abort; - group->domain = get_domain_for_iopf(dev, fault); - if (!group->domain) - goto err_abort; - /* * On success iopf_handler must call iopf_group_response() and * iopf_free_group() */ - if (group->domain->iopf_handler(group)) + if (group->cookie->domain->iopf_handler(group)) goto err_abort; return; @@ -354,6 +450,7 @@ int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) mutex_init(&fault_param->lock); INIT_LIST_HEAD(&fault_param->faults); INIT_LIST_HEAD(&fault_param->partial); + xa_init(&fault_param->pasid_cookie); fault_param->dev = dev; refcount_set(&fault_param->users, 1); list_add(&fault_param->queue_list, &queue->devices); @@ -491,3 +588,63 @@ void iopf_queue_free(struct iopf_queue *queue) kfree(queue); } EXPORT_SYMBOL_GPL(iopf_queue_free); + +int iopf_domain_attach(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid, struct iopf_attach_cookie *cookie) +{ + int ret; + + if (!domain->iopf_handler) + return -EINVAL; + + if (pasid == IOMMU_NO_PASID) + ret = iommu_attach_group(domain, dev->iommu_group); + else + ret = iommu_attach_device_pasid(domain, dev, pasid); + if (ret) + return ret; + + ret = iopf_pasid_cookie_set(domain, dev, pasid, cookie); + if (ret) { + if (pasid == IOMMU_NO_PASID) + iommu_detach_group(domain, dev->iommu_group); + else + iommu_detach_device_pasid(domain, dev, pasid); + } + + return ret; +} +EXPORT_SYMBOL_GPL(iopf_domain_attach); + +void iopf_domain_detach(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) +{ + iopf_pasid_cookie_clear(dev, pasid); + + if (pasid == IOMMU_NO_PASID) + iommu_detach_group(domain, dev->iommu_group); + else + iommu_detach_device_pasid(domain, dev, pasid); +} +EXPORT_SYMBOL_GPL(iopf_domain_detach); + +int iopf_domain_replace(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid, struct iopf_attach_cookie *cookie) +{ + struct iommu_domain *old_domain = iommu_get_domain_for_dev(dev); + int ret; + + if (!old_domain || pasid != IOMMU_NO_PASID || + (!old_domain->iopf_handler && !domain->iopf_handler)) + return -EINVAL; + + ret = iommu_group_replace_domain(dev->iommu_group, domain); + if (ret) + return ret; + + ret = iopf_pasid_cookie_replace(domain, dev, pasid, cookie); + if (ret) + iommu_group_replace_domain(dev->iommu_group, old_domain); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(iopf_domain_replace, IOMMUFD_INTERNAL); diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 640acc804e8cdc1c3151df2178108dbf8856deba..7ed57667d6ef2a41a142aba57890b1e0a75f64b7 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -51,6 +51,39 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de return iommu_mm; } +static void release_attach_cookie(struct iopf_attach_cookie *cookie) +{ + struct iommu_domain *domain = cookie->domain; + + mutex_lock(&iommu_sva_lock); + if (--domain->users == 0) { + list_del(&domain->next); + iommu_domain_free(domain); + } + mutex_unlock(&iommu_sva_lock); + + kfree(cookie); +} + +static int sva_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct iopf_attach_cookie *cookie; + int ret; + + cookie = 
kzalloc(sizeof(*cookie), GFP_KERNEL); + if (!cookie) + return -ENOMEM; + + cookie->release = release_attach_cookie; + + ret = iopf_domain_attach(domain, dev, pasid, cookie); + if (ret) + kfree(cookie); + + return ret; +} + /** * iommu_sva_bind_device() - Bind a process address space to a device * @dev: the device @@ -99,7 +132,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm /* Search for an existing domain. */ list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) { - ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); + ret = sva_attach_device_pasid(domain, dev, iommu_mm->pasid); if (!ret) { domain->users++; goto out; @@ -108,12 +141,12 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm /* Allocate a new domain and set it on device pasid. */ domain = iommu_sva_domain_alloc(dev, mm); - if (!domain) { - ret = -ENOMEM; + if (IS_ERR(domain)) { + ret = PTR_ERR(domain); goto out_free_handle; } - ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); + ret = sva_attach_device_pasid(domain, dev, iommu_mm->pasid); if (ret) goto out_free_domain; domain->users = 1; @@ -157,13 +190,8 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) return; } list_del(&handle->handle_item); - - iommu_detach_device_pasid(domain, dev, iommu_mm->pasid); - if (--domain->users == 0) { - list_del(&domain->next); - iommu_domain_free(domain); - } mutex_unlock(&iommu_sva_lock); + kfree(handle); } EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); @@ -259,7 +287,8 @@ static void iommu_sva_handle_iopf(struct work_struct *work) if (status != IOMMU_PAGE_RESP_SUCCESS) break; - status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm); + status = iommu_sva_handle_mm(&iopf->fault, + group->cookie->domain->mm); } iopf_group_response(group, status); @@ -283,9 +312,15 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; - domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); - if (!domain) - return NULL; + if (ops->domain_alloc_sva) { + domain = ops->domain_alloc_sva(dev, mm); + if (IS_ERR(domain)) + return domain; + } else { + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return ERR_PTR(-ENOMEM); + } domain->type = IOMMU_DOMAIN_SVA; mmgrab(mm); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index cf1646026d2d67b46c530483b5d18d6376acd4b1..ac7a4dc8a6d3afbae7c72c718e1b8453173adb3e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2678,16 +2678,6 @@ static int __init iommu_init(void) } core_initcall(iommu_init); -int iommu_enable_nesting(struct iommu_domain *domain) -{ - if (domain->type != IOMMU_DOMAIN_UNMANAGED) - return -EINVAL; - if (!domain->ops->enable_nesting) - return -EINVAL; - return domain->ops->enable_nesting(domain); -} -EXPORT_SYMBOL_GPL(iommu_enable_nesting); - int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirk) { diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 34b446146961c29e7b24dc5cc890a5aa557a6ce8..b94a74366eedb36afcf9c94461095a39eafac14c 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -6,6 +6,7 @@ iommufd-y := \ ioas.o \ main.o \ pages.o \ + fault.o \ vfio_compat.o iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 873630c111c1fc70a5297f101a67a83090f7f9ff..c4737e876ebc211fa7a74a70f0c6f65af7673f33 100644 --- 
a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -215,6 +215,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, refcount_inc(&idev->obj.users); /* igroup refcount moves into iommufd_device */ idev->igroup = igroup; + xa_init(&idev->faults); /* * If the caller fails after this success it must call @@ -376,7 +377,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. */ if (list_empty(&idev->igroup->device_list)) { - rc = iommu_attach_group(hwpt->domain, idev->igroup->group); + if (hwpt->fault_capable) + rc = iommufd_fault_domain_attach_dev(hwpt, idev); + else + rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; idev->igroup->hwpt = hwpt; @@ -402,7 +406,10 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) mutex_lock(&idev->igroup->lock); list_del(&idev->group_item); if (list_empty(&idev->igroup->device_list)) { - iommu_detach_group(hwpt->domain, idev->igroup->group); + if (hwpt->fault_capable) + iommufd_fault_domain_detach_dev(hwpt, idev); + else + iommu_detach_group(hwpt->domain, idev->igroup->group); idev->igroup->hwpt = NULL; } if (hwpt_is_paging(hwpt)) @@ -497,7 +504,10 @@ iommufd_device_do_replace(struct iommufd_device *idev, goto err_unlock; } - rc = iommu_group_replace_domain(igroup->group, hwpt->domain); + if (old_hwpt->fault_capable || hwpt->fault_capable) + rc = iommufd_fault_domain_replace_dev(hwpt, idev); + else + rc = iommu_group_replace_domain(igroup->group, hwpt->domain); if (rc) goto err_unresv; diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c new file mode 100644 index 0000000000000000000000000000000000000000..a4a49f3cd4c296996898fd5dcb320fb232cb9fd0 --- /dev/null +++ b/drivers/iommu/iommufd/fault.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Intel Corporation + */ +#define pr_fmt(fmt) "iommufd: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iommufd_private.h" + +static int device_add_fault(struct iopf_group *group) +{ + struct iommufd_device *idev = group->cookie->private; + void *curr; + + curr = xa_cmpxchg(&idev->faults, group->last_fault.fault.prm.grpid, + NULL, group, GFP_KERNEL); + + return curr ? 
xa_err(curr) : 0;
+}
+
+static void device_remove_fault(struct iopf_group *group)
+{
+	struct iommufd_device *idev = group->cookie->private;
+
+	xa_store(&idev->faults, group->last_fault.fault.prm.grpid,
+		 NULL, GFP_KERNEL);
+}
+
+static struct iopf_group *device_get_fault(struct iommufd_device *idev,
+					   unsigned long grpid)
+{
+	return xa_load(&idev->faults, grpid);
+}
+
+void iommufd_fault_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
+	struct iopf_group *group, *next;
+
+	mutex_lock(&fault->mutex);
+	list_for_each_entry_safe(group, next, &fault->deliver, node) {
+		list_del(&group->node);
+		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+		iopf_free_group(group);
+	}
+	list_for_each_entry_safe(group, next, &fault->response, node) {
+		list_del(&group->node);
+		device_remove_fault(group);
+		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+		iopf_free_group(group);
+	}
+	mutex_unlock(&fault->mutex);
+
+	mutex_destroy(&fault->mutex);
+}
+
+static void iommufd_compose_fault_message(struct iommu_fault *fault,
+					  struct iommu_hwpt_pgfault *hwpt_fault,
+					  struct iommufd_device *idev)
+{
+	hwpt_fault->size = sizeof(*hwpt_fault);
+	hwpt_fault->flags = fault->prm.flags;
+	hwpt_fault->dev_id = idev->obj.id;
+	hwpt_fault->pasid = fault->prm.pasid;
+	hwpt_fault->grpid = fault->prm.grpid;
+	hwpt_fault->perm = fault->prm.perm;
+	hwpt_fault->addr = fault->prm.addr;
+}
+
+static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
+				       size_t count, loff_t *ppos)
+{
+	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
+	struct iommufd_fault *fault = filep->private_data;
+	struct iommu_hwpt_pgfault data;
+	struct iommufd_device *idev;
+	struct iopf_group *group;
+	struct iopf_fault *iopf;
+	size_t done = 0;
+	int rc;
+
+	if (*ppos || count % fault_size)
+		return -ESPIPE;
+
+	mutex_lock(&fault->mutex);
+	while (!list_empty(&fault->deliver) && count > done) {
+		group = list_first_entry(&fault->deliver,
+					 struct iopf_group, node);
+
+		if (list_count_nodes(&group->faults) * fault_size > count - done)
+			break;
+
+		idev = (struct iommufd_device *)group->cookie->private;
+		list_for_each_entry(iopf, &group->faults, list) {
+			iommufd_compose_fault_message(&iopf->fault, &data, idev);
+			rc = copy_to_user(buf + done, &data, fault_size);
+			if (rc)
+				goto err_unlock;
+			done += fault_size;
+		}
+
+		rc = device_add_fault(group);
+		if (rc)
+			goto err_unlock;
+
+		list_move_tail(&group->node, &fault->response);
+	}
+	mutex_unlock(&fault->mutex);
+
+	return done;
+err_unlock:
+	mutex_unlock(&fault->mutex);
+	return rc;
+}
+
+static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	size_t response_size = sizeof(struct iommu_hwpt_page_response);
+	struct iommufd_fault *fault = filep->private_data;
+	struct iommu_hwpt_page_response response;
+	struct iommufd_device *idev;
+	struct iopf_group *group;
+	size_t done = 0;
+	int rc;
+
+	if (*ppos || count % response_size)
+		return -ESPIPE;
+
+	while (!list_empty(&fault->response) && count > done) {
+		rc = copy_from_user(&response, buf + done, response_size);
+		if (rc)
+			break;
+
+		idev = container_of(iommufd_get_object(fault->ictx,
+						       response.dev_id,
+						       IOMMUFD_OBJ_DEVICE),
+				    struct iommufd_device, obj);
+		if (IS_ERR(idev))
+			break;
+
+		group = device_get_fault(idev, response.grpid);
+		if (!group ||
+		    response.addr != group->last_fault.fault.prm.addr ||
+		    ((group->last_fault.fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
+		     response.pasid != group->last_fault.fault.prm.pasid)) {
+			iommufd_put_object(fault->ictx, &idev->obj);
+			break;
+		}
+
+		iopf_group_response(group, response.code);
+
+		mutex_lock(&fault->mutex);
+		list_del(&group->node);
+		mutex_unlock(&fault->mutex);
+
+		device_remove_fault(group);
+		iopf_free_group(group);
+		done += response_size;
+
+		iommufd_put_object(fault->ictx, &idev->obj);
+	}
+
+	return done;
+}
+
+static __poll_t iommufd_fault_fops_poll(struct file *filep,
+					struct poll_table_struct *wait)
+{
+	struct iommufd_fault *fault = filep->private_data;
+	__poll_t pollflags = 0;
+
+	poll_wait(filep, &fault->wait_queue, wait);
+	mutex_lock(&fault->mutex);
+	if (!list_empty(&fault->deliver))
+		pollflags = EPOLLIN | EPOLLRDNORM;
+	mutex_unlock(&fault->mutex);
+
+	return pollflags;
+}
+
+static const struct file_operations iommufd_fault_fops = {
+	.owner = THIS_MODULE,
+	.open = nonseekable_open,
+	.read = iommufd_fault_fops_read,
+	.write = iommufd_fault_fops_write,
+	.poll = iommufd_fault_fops_poll,
+	.llseek = no_llseek,
+};
+
+static int get_fault_fd(struct iommufd_fault *fault)
+{
+	struct file *filep;
+	int fdno;
+
+	fdno = get_unused_fd_flags(O_CLOEXEC);
+	if (fdno < 0)
+		return fdno;
+
+	filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
+				   fault, O_RDWR);
+	if (IS_ERR(filep)) {
+		put_unused_fd(fdno);
+		return PTR_ERR(filep);
+	}
+
+	fd_install(fdno, filep);
+
+	return fdno;
+}
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_fault_alloc *cmd = ucmd->cmd;
+	struct iommufd_fault *fault;
+	int rc;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT);
+	if (IS_ERR(fault))
+		return PTR_ERR(fault);
+
+	fault->ictx = ucmd->ictx;
+	INIT_LIST_HEAD(&fault->deliver);
+	INIT_LIST_HEAD(&fault->response);
+	mutex_init(&fault->mutex);
+	init_waitqueue_head(&fault->wait_queue);
+
+	rc = get_fault_fd(fault);
+	if (rc < 0)
+		goto out_abort;
+
+	cmd->out_fault_id = fault->obj.id;
+	cmd->out_fault_fd = rc;
+
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_abort;
+	iommufd_object_finalize(ucmd->ictx, &fault->obj);
+
+	return 0;
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj);
+
+	return rc;
+}
+
+int iommufd_fault_iopf_handler(struct iopf_group *group)
+{
+	struct iommufd_hw_pagetable *hwpt = group->cookie->domain->fault_data;
+	struct iommufd_fault *fault = hwpt->fault;
+
+	mutex_lock(&fault->mutex);
+	list_add_tail(&group->node, &fault->deliver);
+	mutex_unlock(&fault->mutex);
+
+	wake_up_interruptible(&fault->wait_queue);
+
+	return 0;
+}
+
+static void release_attach_cookie(struct iopf_attach_cookie *cookie)
+{
+	struct iommufd_hw_pagetable *hwpt = cookie->domain->fault_data;
+	struct iommufd_device *idev = cookie->private;
+
+	refcount_dec(&idev->obj.users);
+	refcount_dec(&hwpt->obj.users);
+	kfree(cookie);
+}
+
+static int iommufd_fault_iopf_enable(struct iommufd_device *idev)
+{
+	int ret;
+
+	if (idev->iopf_enabled)
+		return 0;
+
+	ret = iommu_dev_enable_feature(idev->dev, IOMMU_DEV_FEAT_IOPF);
+	if (ret)
+		return ret;
+
+	idev->iopf_enabled = true;
+
+	return 0;
+}
+
+static void iommufd_fault_iopf_disable(struct iommufd_device *idev)
+{
+	if (!idev->iopf_enabled)
+		return;
+
+	iommu_dev_disable_feature(idev->dev, IOMMU_DEV_FEAT_IOPF);
+	idev->iopf_enabled = false;
+}
+
+int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt,
+				    struct iommufd_device *idev)
+{
+	struct iopf_attach_cookie *cookie;
+	int ret;
+
+	cookie = 
kzalloc(sizeof(*cookie), GFP_KERNEL); + if (!cookie) + return -ENOMEM; + + refcount_inc(&hwpt->obj.users); + refcount_inc(&idev->obj.users); + cookie->release = release_attach_cookie; + cookie->private = idev; + + if (!idev->iopf_enabled) { + ret = iommufd_fault_iopf_enable(idev); + if (ret) + goto out_put_cookie; + } + + ret = iopf_domain_attach(hwpt->domain, idev->dev, IOMMU_NO_PASID, cookie); + if (ret) + goto out_disable_iopf; + + return 0; +out_disable_iopf: + iommufd_fault_iopf_disable(idev); +out_put_cookie: + release_attach_cookie(cookie); + + return ret; +} + +void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + iopf_domain_detach(hwpt->domain, idev->dev, IOMMU_NO_PASID); + iommufd_fault_iopf_disable(idev); +} + +int iommufd_fault_domain_replace_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + bool iopf_enabled_originally = idev->iopf_enabled; + struct iopf_attach_cookie *cookie = NULL; + int ret; + + if (hwpt->fault_capable) { + cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); + if (!cookie) + return -ENOMEM; + + refcount_inc(&hwpt->obj.users); + refcount_inc(&idev->obj.users); + cookie->release = release_attach_cookie; + cookie->private = idev; + + if (!idev->iopf_enabled) { + ret = iommufd_fault_iopf_enable(idev); + if (ret) { + release_attach_cookie(cookie); + return ret; + } + } + } + + ret = iopf_domain_replace(hwpt->domain, idev->dev, IOMMU_NO_PASID, cookie); + if (ret) { + goto out_put_cookie; + } + + if (iopf_enabled_originally && !hwpt->fault_capable) + iommufd_fault_iopf_disable(idev); + + return 0; +out_put_cookie: + if (hwpt->fault_capable) + release_attach_cookie(cookie); + if (iopf_enabled_originally && !idev->iopf_enabled) + iommufd_fault_iopf_enable(idev); + else if (!iopf_enabled_originally && idev->iopf_enabled) + iommufd_fault_iopf_disable(idev); + + return ret; +} diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 33d142f8057d70a77f44e842afdd84b1bee0a970..95ea943f5be539b9ce2ea22560c58c0a2ac0186a 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -8,6 +8,15 @@ #include "../iommu-priv.h" #include "iommufd_private.h" +static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt) +{ + if (hwpt->domain) + iommu_domain_free(hwpt->domain); + + if (hwpt->fault) + iommufd_put_object(hwpt->fault->ictx, &hwpt->fault->obj); +} + void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) { struct iommufd_hwpt_paging *hwpt_paging = @@ -22,9 +31,7 @@ void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) hwpt_paging->common.domain); } - if (hwpt_paging->common.domain) - iommu_domain_free(hwpt_paging->common.domain); - + __iommufd_hwpt_destroy(&hwpt_paging->common); refcount_dec(&hwpt_paging->ioas->obj.users); } @@ -49,9 +56,7 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) struct iommufd_hwpt_nested *hwpt_nested = container_of(obj, struct iommufd_hwpt_nested, common.obj); - if (hwpt_nested->common.domain) - iommu_domain_free(hwpt_nested->common.domain); - + __iommufd_hwpt_destroy(&hwpt_nested->common); refcount_dec(&hwpt_nested->parent->common.obj.users); } @@ -213,7 +218,8 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt; int rc; - if (flags || !user_data->len || !ops->domain_alloc_user) + if ((flags & ~IOMMU_HWPT_FAULT_ID_VALID) || + !user_data->len || !ops->domain_alloc_user) return ERR_PTR(-EOPNOTSUPP); if (parent->auto_domain || 
!parent->nest_parent) return ERR_PTR(-EINVAL); @@ -227,7 +233,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, refcount_inc(&parent->common.obj.users); hwpt_nested->parent = parent; - hwpt->domain = ops->domain_alloc_user(idev->dev, flags, + hwpt->domain = ops->domain_alloc_user(idev->dev, 0, parent->common.domain, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); @@ -308,6 +314,20 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) goto out_put_pt; } + if (cmd->flags & IOMMU_HWPT_FAULT_ID_VALID) { + struct iommufd_fault *fault; + + fault = iommufd_get_fault(ucmd, cmd->fault_id); + if (IS_ERR(fault)) { + rc = PTR_ERR(fault); + goto out_hwpt; + } + hwpt->fault = fault; + hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; + hwpt->domain->fault_data = hwpt; + hwpt->fault_capable = true; + } + cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 991f864d1f9bc175b9acc9b9c445d32ed837f878..9844a1289c0119365e4d7ad0bc07677dfaf125d0 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -128,6 +128,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_HWPT_NESTED, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, + IOMMUFD_OBJ_FAULT, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -292,6 +293,8 @@ int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; + struct iommufd_fault *fault; + bool fault_capable : 1; }; struct iommufd_hwpt_paging { @@ -395,6 +398,9 @@ struct iommufd_device { /* always the physical device */ struct device *dev; bool enforce_cache_coherency; + bool iopf_enabled; + /* outstanding faults awaiting response indexed by fault group id */ + struct xarray faults; }; static inline struct iommufd_device * @@ -426,6 +432,41 @@ void iopt_remove_access(struct io_pagetable *iopt, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); +/* + * An iommufd_fault object represents an interface to deliver I/O page faults + * to the user space. These objects are created/destroyed by the user space and + * associated with hardware page table objects during page-table allocation. + */ +struct iommufd_fault { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + + /* The lists of outstanding faults protected by below mutex. 
*/ + struct mutex mutex; + struct list_head deliver; + struct list_head response; + + struct wait_queue_head wait_queue; +}; + +static inline struct iommufd_fault * +iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_FAULT), + struct iommufd_fault, obj); +} + +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); +void iommufd_fault_destroy(struct iommufd_object *obj); +int iommufd_fault_iopf_handler(struct iopf_group *group); +int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); +void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); +int iommufd_fault_domain_replace_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index e854d3f672051b5223e0fec8af741abf03bbffbd..acbbba1c66716e9fb67273436d8f39967b127bfa 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -22,6 +22,7 @@ enum { IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, IOMMU_TEST_OP_DIRTY, IOMMU_TEST_OP_MD_CHECK_IOTLB, + IOMMU_TEST_OP_TRIGGER_IOPF, }; enum { @@ -127,6 +128,13 @@ struct iommu_test_cmd { __u32 id; __u32 iotlb; } check_iotlb; + struct { + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + } trigger_iopf; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 39b32932c61ee4e924e24ab18fc05ec4149829c9..792db077d33eb2a9d5e3776abcf7078f283332f8 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -332,6 +332,7 @@ union ucmd_buffer { struct iommu_ioas_unmap unmap; struct iommu_option option; struct iommu_vfio_ioas vfio_ioas; + struct iommu_fault_alloc fault; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -381,6 +382,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { val64), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), + IOCTL_OP(IOMMU_FAULT_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc, + out_fault_fd), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -513,6 +516,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_hwpt_nested_destroy, .abort = iommufd_hwpt_nested_abort, }, + [IOMMUFD_OBJ_FAULT] = { + .destroy = iommufd_fault_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 7a2199470f3121da91e060bca82315a6944e37b8..53a3be9047a585fabfad5b4bf171d24858c86f85 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -504,6 +504,8 @@ static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) return false; } +static struct iopf_queue *mock_iommu_iopf_queue; + static struct iommu_device mock_iommu_device = { }; @@ -514,6 +516,29 @@ static struct iommu_device *mock_probe_device(struct device *dev) return &mock_iommu_device; } +static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ +} + +static int mock_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat != 
IOMMU_DEV_FEAT_IOPF || !mock_iommu_iopf_queue) + return -ENODEV; + + return iopf_queue_add_device(mock_iommu_iopf_queue, dev); +} + +static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat != IOMMU_DEV_FEAT_IOPF || !mock_iommu_iopf_queue) + return -ENODEV; + + iopf_queue_remove_device(mock_iommu_iopf_queue, dev); + + return 0; +} + static const struct iommu_ops mock_ops = { /* * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type() @@ -529,6 +554,9 @@ static const struct iommu_ops mock_ops = { .capable = mock_domain_capable, .device_group = generic_device_group, .probe_device = mock_probe_device, + .page_response = mock_domain_page_response, + .dev_enable_feat = mock_dev_enable_feat, + .dev_disable_feat = mock_dev_disable_feat, .default_domain_ops = &(struct iommu_domain_ops){ .free = mock_domain_free, @@ -1375,6 +1403,31 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, return rc; } +static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct iopf_fault event = { }; + struct iommufd_device *idev; + + idev = iommufd_get_device(ucmd, cmd->trigger_iopf.dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + event.fault.prm.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (cmd->trigger_iopf.pasid != IOMMU_NO_PASID) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = cmd->trigger_iopf.addr; + event.fault.prm.pasid = cmd->trigger_iopf.pasid; + event.fault.prm.grpid = cmd->trigger_iopf.grpid; + event.fault.prm.perm = cmd->trigger_iopf.perm; + + iommu_report_device_fault(idev->dev, &event); + iommufd_put_object(ucmd->ictx, &idev->obj); + + return 0; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); @@ -1450,6 +1503,8 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->dirty.page_size, u64_to_user_ptr(cmd->dirty.uptr), cmd->dirty.flags); + case IOMMU_TEST_OP_TRIGGER_IOPF: + return iommufd_test_trigger_iopf(ucmd, cmd); default: return -EOPNOTSUPP; } @@ -1491,6 +1546,9 @@ int __init iommufd_test_init(void) &iommufd_mock_bus_type.nb); if (rc) goto err_sysfs; + + mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq"); + return 0; err_sysfs: @@ -1506,6 +1564,11 @@ int __init iommufd_test_init(void) void iommufd_test_exit(void) { + if (mock_iommu_iopf_queue) { + iopf_queue_free(mock_iommu_iopf_queue); + mock_iommu_iopf_queue = NULL; + } + iommu_device_sysfs_remove(&mock_iommu_device); iommu_device_unregister_bus(&mock_iommu_device, &iommufd_mock_bus_type.bus, diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index a3ad5f0b6c59dddc8fdd17723eb01ff8342c86ab..514aacd6400949d673236c11f5c425415df69675 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, case VFIO_DMA_CC_IOMMU: return iommufd_vfio_cc_iommu(ictx); - /* - * This is obsolete, and to be removed from VFIO. It was an incomplete - * idea that got merged. 
- * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ - */ - case VFIO_TYPE1_NESTING_IOMMU: + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: return 0; /* diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 7e2e62ab0869cfb42ae786f9c41b343b42209de7..333d74b8938a3bda81030d4ed5dcfe53ec4b2889 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -95,7 +95,7 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF, [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */ [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */ - [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */ + [PCI_EXT_CAP_ID_PASID] = PCI_EXT_CAP_PASID_SIZEOF, /* not yet */ [PCI_EXT_CAP_ID_DVSEC] = 0xFF, }; @@ -1606,11 +1606,302 @@ static int vfio_cap_init(struct vfio_pci_core_device *vdev) return 0; } +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_core_device *vdev, + int offset, uint8_t *data, int size) +{ + int ret = 0, data_offset = 0; + + while (size) { + int filled; + + if (size >= 4 && !(offset % 4)) { + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; + u32 dword; + + memcpy(&dword, data + data_offset, 4); + *dwordp = cpu_to_le32(dword); + filled = 4; + } else if (size >= 2 && !(offset % 2)) { + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; + u16 word; + + memcpy(&word, data + data_offset, 2); + *wordp = cpu_to_le16(word); + filled = 2; + } else { + u8 *byte = &vdev->vconfig[offset]; + + memcpy(byte, data + data_offset, 1); + filled = 1; + } + + offset += filled; + data_offset += filled; + size -= filled; + } + + return ret; +} + +static int vfio_pci_get_ecap_content(struct pci_dev *pdev, + int cap, int cap_len, u8 *content) +{ + int pos, offset, len = cap_len, ret = 0; + + pos = pci_find_ext_capability(pdev, cap); + if (!pos) + return -EINVAL; + + offset = 0; + while (len) { + int fetched; + + if (len >= 4 && !(pos % 4)) { + u32 *dwordp = (u32 *) (content + offset); + u32 dword; + __le32 *dwptr = (__le32 *) &dword; + + ret = pci_read_config_dword(pdev, pos, &dword); + if (ret) + return ret; + *dwordp = le32_to_cpu(*dwptr); + fetched = 4; + } else if (len >= 2 && !(pos % 2)) { + u16 *wordp = (u16 *) (content + offset); + u16 word; + __le16 *wptr = (__le16 *) &word; + + ret = pci_read_config_word(pdev, pos, &word); + if (ret) + return ret; + *wordp = le16_to_cpu(*wptr); + fetched = 2; + } else { + u8 *byte = (u8 *) (content + offset); + + ret = pci_read_config_byte(pdev, pos, byte); + if (ret) + return ret; + fetched = 1; + } + + pos += fetched; + offset += fetched; + len -= fetched; + } + + return ret; +} + +struct vfio_pci_pasid_cap_data { + u32 id:16; + u32 version:4; + u32 next:12; + union { + u16 cap_reg_val; + struct { + u16 rsv1:1; + u16 execs:1; + u16 prvs:1; + u16 rsv2:5; + u16 pasid_bits:5; + u16 rsv3:3; + }; + } cap_reg; + union { + u16 control_reg_val; + struct { + u16 paside:1; + u16 exece:1; + u16 prve:1; + u16 rsv4:13; + }; + } control_reg; +}; + +static int vfio_pci_add_pasid_cap(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + u16 epos, u16 *next, __le32 **prevp) +{ + u8 *map = vdev->pci_config_map; + int ecap = PCI_EXT_CAP_ID_PASID; + int len = pci_ext_cap_length[ecap]; + struct vfio_pci_pasid_cap_data pasid_cap; + struct vfio_pci_pasid_cap_data vpasid_cap; + int ret; + + /* + * If no cap filled in this function, should make sure the next + * pointer points to current epos. 
+ */ + *next = epos; + + if (!len) { + pr_info("%s: VF %s hiding PASID capability\n", + __func__, dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + /* Add PASID capability*/ + ret = vfio_pci_get_ecap_content(pdev, ecap, + len, (u8 *)&pasid_cap); + if (ret) + goto out; + + if (!pasid_cap.control_reg.paside) { + pr_debug("%s: its PF's PASID capability is not enabled\n", + dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + memcpy(&vpasid_cap, &pasid_cap, len); + + vpasid_cap.next = 0; + /* clear the control reg for guest */ + memset(&vpasid_cap.control_reg, 0x0, + sizeof(vpasid_cap.control_reg)); + + memset(map + epos, vpasid_cap.id, len); + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, + (u8 *)&vpasid_cap, len); + if (!ret) { + /* + * Successfully filled in PASID cap, update + * the next offset in previous cap header, + * and also update caller about the offset + * of next cap if any. + */ + u32 val = epos; + **prevp &= cpu_to_le32(~(0xffcU << 20)); + **prevp |= cpu_to_le32(val << 20); + *prevp = (__le32 *)&vdev->vconfig[epos]; + *next = epos + len; + } + +out: + return ret; +} + +struct vfio_pci_pri_cap_data { + u32 id:16; + u32 version:4; + u32 next:12; + union { + u16 control_reg_val; + struct { + u16 enable:1; + u16 reset:1; + u16 rsv1:14; + }; + } control_reg; + union { + u16 status_reg_val; + struct { + u16 rf:1; + u16 uprgi:1; + u16 rsv2:6; + u16 stop:1; + u16 rsv3:6; + u16 pasid_required:1; + }; + } status_reg; + u32 prq_capacity; + u32 prq_quota; +}; + +static int vfio_pci_add_pri_cap(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + u16 epos, u16 *next, __le32 **prevp) +{ + u8 *map = vdev->pci_config_map; + int ecap = PCI_EXT_CAP_ID_PRI; + int len = pci_ext_cap_length[ecap]; + struct vfio_pci_pri_cap_data pri_cap; + struct vfio_pci_pri_cap_data vpri_cap; + int ret; + + /* + * If no cap filled in this function, should make sure the next + * pointer points to current epos. + */ + *next = epos; + + if (!len) { + pr_info("%s: VF %s hiding PRI capability\n", + __func__, dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + /* Add PASID capability*/ + ret = vfio_pci_get_ecap_content(pdev, ecap, + len, (u8 *)&pri_cap); + if (ret) + goto out; + + if (!pri_cap.control_reg.enable) { + pr_debug("%s: its PF's PRI capability is not enabled\n", + dev_name(&vdev->pdev->dev)); + ret = 0; + goto out; + } + + memcpy(&vpri_cap, &pri_cap, len); + + vpri_cap.next = 0; + /* clear the control reg for guest */ + memset(&vpri_cap.control_reg, 0x0, + sizeof(vpri_cap.control_reg)); + + memset(map + epos, vpri_cap.id, len); + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, + (u8 *)&vpri_cap, len); + if (!ret) { + /* + * Successfully filled in PASID cap, update + * the next offset in previous cap header, + * and also update caller about the offset + * of next cap if any. 
+ */ + u32 val = epos; + **prevp &= cpu_to_le32(~(0xffcU << 20)); + **prevp |= cpu_to_le32(val << 20); + *prevp = (__le32 *)&vdev->vconfig[epos]; + *next = epos + len; + } + +out: + return ret; +} + +static int vfio_pci_add_emulated_cap_for_vf(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, u16 start_epos, __le32 *prev) +{ + __le32 *__prev = prev; + u16 epos = start_epos, epos_next = start_epos; + int ret = 0; + + /* Add PASID capability*/ + ret = vfio_pci_add_pasid_cap(vdev, pdev, epos, + &epos_next, &__prev); + if (ret) + return ret; + + /* Add PRI capability */ + epos = epos_next; + ret = vfio_pci_add_pri_cap(vdev, pdev, epos, + &epos_next, &__prev); + + return ret; +} + static int vfio_ecap_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u8 *map = vdev->pci_config_map; - u16 epos; + u16 epos, epos_max; __le32 *prev = NULL; int loops, ret, ecaps = 0; @@ -1618,6 +1909,7 @@ static int vfio_ecap_init(struct vfio_pci_core_device *vdev) return 0; epos = PCI_CFG_SPACE_SIZE; + epos_max = PCI_CFG_SPACE_SIZE; loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; @@ -1642,6 +1934,9 @@ static int vfio_ecap_init(struct vfio_pci_core_device *vdev) } } + if (epos_max <= (epos + len)) + epos_max = epos + len; + if (!len) { pci_dbg(pdev, "%s: hiding ecap %#x@%#x\n", __func__, ecap, epos); @@ -1701,6 +1996,18 @@ static int vfio_ecap_init(struct vfio_pci_core_device *vdev) if (!ecaps) *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; +#ifdef CONFIG_PCI_ATS + if (pdev->is_virtfn) { + struct pci_dev *physfn = pdev->physfn; + + ret = vfio_pci_add_emulated_cap_for_vf(vdev, + physfn, epos_max, prev); + if (ret) + pr_info("%s, failed to add special caps for VF %s\n", + __func__, dev_name(&vdev->pdev->dev)); + } +#endif + return 0; } @@ -1859,6 +2166,17 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_core_device *vdev, return i; } +static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) +{ +#ifdef CONFIG_PCI_ATS + return (pdev->is_virtfn && + (cap_id == PCI_EXT_CAP_ID_PASID || + cap_id == PCI_EXT_CAP_ID_PRI)); +#else + return false; +#endif +} + static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -1892,7 +2210,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user if (cap_id == PCI_CAP_ID_INVALID) { perm = &unassigned_perms; cap_start = *ppos; - } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT || + vfio_pci_need_virt_perm(pdev, cap_id)) { perm = &virt_perms; cap_start = *ppos; } else { diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9c4adf11dbbe357c14ffc926fe496a40b0f4ba22..0b8e0571ecf3672fb6dbfd0b1bb659ae8efae19f 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -73,7 +73,6 @@ struct vfio_iommu { uint64_t num_non_pinned_groups; uint64_t num_non_hwdbm_domains; bool v2; - bool nesting; bool dirty_page_tracking; struct list_head emulated_iommu_groups; bool dirty_log_get_no_clear; @@ -2448,12 +2447,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (!domain->domain) goto out_free_domain; - if (iommu->nesting) { - ret = iommu_enable_nesting(domain->domain); - if (ret) - goto out_domain; - } - ret = iommu_attach_group(domain->domain, group->iommu_group); if (ret) goto out_domain; @@ -2801,9 +2794,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) switch (arg) { case VFIO_TYPE1_IOMMU: break; - 
case VFIO_TYPE1_NESTING_IOMMU: - iommu->nesting = true; - fallthrough; + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: case VFIO_TYPE1v2_IOMMU: iommu->v2 = true; break; @@ -2898,7 +2889,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, switch (arg) { case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_NESTING_IOMMU: case VFIO_UNMAP_ALL: return 1; case VFIO_UPDATE_VADDR: diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 98b0c455aec804aebc159f4740674916c49461fb..dd6b261eccded5dc28a36f05d191ae70c05e8eef 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -116,6 +116,16 @@ struct iommu_page_response { u32 code; }; +struct iopf_attach_cookie { + struct iommu_domain *domain; + struct device *dev; + unsigned int pasid; + refcount_t users; + + void *private; + void (*release)(struct iopf_attach_cookie *cookie); +}; + struct iopf_fault { struct iommu_fault fault; /* node for pending lists */ @@ -128,9 +138,11 @@ struct iopf_group { /* list node for iommu_fault_param::faults */ struct list_head pending_node; struct work_struct work; - struct iommu_domain *domain; /* The device's fault data parameter. */ struct iommu_fault_param *fault_param; + struct iopf_attach_cookie *cookie; + /* Used by handler provider to hook the group on its own lists. */ + struct list_head node; }; /** @@ -519,6 +531,7 @@ static inline int __iommu_copy_struct_from_user_array( * Upon failure, ERR_PTR must be returned. * @domain_alloc_paging: Allocate an iommu_domain that can be used for * UNMANAGED, DMA, and DMA_FQ domain types. + * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing. * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -559,6 +572,8 @@ struct iommu_ops { struct device *dev, u32 flags, struct iommu_domain *parent, const struct iommu_user_data *user_data); struct iommu_domain *(*domain_alloc_paging)(struct device *dev); + struct iommu_domain *(*domain_alloc_sva)(struct device *dev, + struct mm_struct *mm); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); @@ -624,13 +639,14 @@ struct iommu_ops { * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform * specific mechanisms. - * @enable_nesting: Enable nesting * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @support_dirty_log: Check whether domain supports dirty log tracking * @switch_dirty_log: Perform actions to start|stop dirty log tracking * @sync_dirty_log: Sync dirty log from IOMMU into a dirty bitmap * @clear_dirty_log: Clear dirty log of IOMMU by a mask bitmap * @free: Release the domain after use. + * @get_msi_mapping_domain: Return the related iommu_domain that should hold the + * MSI cookie and accept mapping(s). 
*/ struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); @@ -656,7 +672,6 @@ struct iommu_domain_ops { dma_addr_t iova); bool (*enforce_cache_coherency)(struct iommu_domain *domain); - int (*enable_nesting)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, unsigned long quirks); @@ -676,6 +691,8 @@ struct iommu_domain_ops { unsigned long *bitmap, unsigned long base_iova, unsigned long bitmap_pgshift); void (*free)(struct iommu_domain *domain); + struct iommu_domain * + (*get_msi_mapping_domain)(struct iommu_domain *domain); }; /** @@ -716,6 +733,7 @@ struct iommu_fault_param { struct device *dev; struct iopf_queue *queue; struct list_head queue_list; + struct xarray pasid_cookie; struct list_head partial; struct list_head faults; @@ -839,7 +857,6 @@ extern void iommu_group_put(struct iommu_group *group); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); -int iommu_enable_nesting(struct iommu_domain *domain); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); @@ -1616,6 +1633,12 @@ void iopf_free_group(struct iopf_group *group); void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); void iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status); +int iopf_domain_attach(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid, struct iopf_attach_cookie *cookie); +void iopf_domain_detach(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid); +int iopf_domain_replace(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid, struct iopf_attach_cookie *cookie); #else static inline int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) @@ -1660,5 +1683,24 @@ static inline void iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status) { } + +static inline int iopf_domain_attach(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iopf_attach_cookie *cookie) +{ + return -ENODEV; +} + +static inline void iopf_domain_detach(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ +} + +static inline int iopf_domain_replace(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iopf_attach_cookie *cookie) +{ + return -ENODEV; +} #endif /* CONFIG_IOMMU_IOPF */ #endif /* __LINUX_IOMMU_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 1dfeaa2e649ee41751162073463df20cb0a130ad..aa57a6195d89adcbed0bee9b4ee564a4c99f2177 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -50,6 +50,7 @@ enum { IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, IOMMUFD_CMD_HWPT_INVALIDATE, + IOMMUFD_CMD_FAULT_ALLOC, }; /** @@ -356,10 +357,13 @@ struct iommu_vfio_ioas { * the parent HWPT in a nesting configuration. * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is * enforced on device attachment + * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is + * valid. 
*/ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, + IOMMU_HWPT_FAULT_ID_VALID = 1 << 2, }; /** @@ -390,14 +394,34 @@ struct iommu_hwpt_vtd_s1 { __u32 __reserved; }; +/** + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * + * @ste: The first two double words of the user space Stream Table Entry for + * a user stage-1 Context Descriptor Table. Must be little-endian. + * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) + * - word-0: V, S1Fmt, S1ContextPtr, S1CDMax + * - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * @sid: The user space Stream ID to index the user Stream Table Entry @ste + * + * -EIO will be returned if @ste is not legal or contains any non-allowed field. + */ +struct iommu_hwpt_arm_smmuv3 { + __aligned_le64 ste[2]; + __u32 sid; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE, IOMMU_HWPT_DATA_VTD_S1, + IOMMU_HWPT_DATA_ARM_SMMUV3, }; /** @@ -410,6 +434,8 @@ enum iommu_hwpt_data_type { * @__reserved: Must be 0 * @data_type: One of enum iommu_hwpt_data_type * @data_len: Length of the type specific data + * @fault_id: The ID of IOMMUFD_FAULT object. Valid only if flags field of + * IOMMU_HWPT_FAULT_ID_VALID is set. * @data_uptr: User pointer to the type specific data * * Explicitly allocate a hardware page table object. This is the same object @@ -440,6 +466,7 @@ struct iommu_hwpt_alloc { __u32 __reserved; __u32 data_type; __u32 data_len; + __u32 fault_id; __aligned_u64 data_uptr; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) @@ -475,15 +502,50 @@ struct iommu_hw_info_vtd { __aligned_u64 ecap_reg; }; +/** + * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information + * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + * + * @flags: Must be set to 0 + * @__reserved: Must be 0 + * @idr: Implemented features for ARM SMMU Non-secure programming interface + * @iidr: Information about the implementation and implementer of ARM SMMU, + * and architecture version supported + * @aidr: ARM SMMU architecture version + * + * For the details of @idr, @iidr and @aidr, please refer to the chapters + * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. + * + * User space should read the underlying ARM SMMUv3 hardware information for + * the list of supported features. + * + * Note that these values reflect the raw HW capability, without any insight if + * any required kernel driver support is present. Bits may be set indicating the + * HW has functionality that is lacking kernel software support, such as BTM. If + * a VMM is using this information to construct emulated copies of these + * registers it should only forward bits that it knows it can support. + * + * In future, presence of required kernel support will be indicated in flags. 
+ */ +struct iommu_hw_info_arm_smmuv3 { + __u32 flags; + __u32 __reserved; + __u32 idr[6]; + __u32 iidr; + __u32 aidr; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware * info * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type + * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE, IOMMU_HW_INFO_TYPE_INTEL_VTD, + IOMMU_HW_INFO_TYPE_ARM_SMMUV3, }; /** @@ -659,6 +721,26 @@ struct iommu_hwpt_vtd_s1_invalidate { __u32 __reserved; }; +/** + * struct iommu_hwpt_arm_smmuv3_invalidate - ARM SMMUv3 cahce invalidation + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ. + * Must be little-endian. + * + * Supported command list: + * CMDQ_OP_TLBI_NSNH_ALL + * CMDQ_OP_TLBI_NH_VA + * CMDQ_OP_TLBI_NH_VAA + * CMDQ_OP_TLBI_NH_ALL + * CMDQ_OP_TLBI_NH_ASID + * CMDQ_OP_ATC_INV + * CMDQ_OP_CFGI_CD + * CMDQ_OP_CFGI_CD_ALL + */ +struct iommu_hwpt_arm_smmuv3_invalidate { + __aligned_u64 cmd[2]; +}; + /** * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) * @size: sizeof(struct iommu_hwpt_invalidate) @@ -692,4 +774,88 @@ struct iommu_hwpt_invalidate { __u32 __reserved; }; #define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE) + +/** + * enum iommu_hwpt_pgfault_flags - flags for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_FLAGS_PASID_VALID: The pasid field of the fault data is + * valid. + * @IOMMU_PGFAULT_FLAGS_LAST_PAGE: It's the last fault of a fault group. + */ +enum iommu_hwpt_pgfault_flags { + IOMMU_PGFAULT_FLAGS_PASID_VALID = (1 << 0), + IOMMU_PGFAULT_FLAGS_LAST_PAGE = (1 << 1), +}; + +/** + * enum iommu_hwpt_pgfault_perm - perm bits for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_PERM_READ: request for read permission + * @IOMMU_PGFAULT_PERM_WRITE: request for write permission + * @IOMMU_PGFAULT_PERM_EXEC: request for execute permission + * @IOMMU_PGFAULT_PERM_PRIV: request for privileged permission + */ +enum iommu_hwpt_pgfault_perm { + IOMMU_PGFAULT_PERM_READ = (1 << 0), + IOMMU_PGFAULT_PERM_WRITE = (1 << 1), + IOMMU_PGFAULT_PERM_EXEC = (1 << 2), + IOMMU_PGFAULT_PERM_PRIV = (1 << 3), +}; + +/** + * struct iommu_hwpt_pgfault - iommu page fault data + * @size: sizeof(struct iommu_hwpt_pgfault) + * @flags: Combination of enum iommu_hwpt_pgfault_flags + * @dev_id: id of the originated device + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: Combination of enum iommu_hwpt_pgfault_perm + * @addr: page address + */ +struct iommu_hwpt_pgfault { + __u32 size; + __u32 flags; + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; +}; + +/** + * struct iommu_hwpt_page_response - IOMMU page fault response + * @size: sizeof(struct iommu_hwpt_page_response) + * @flags: Must be set to 0 + * @dev_id: device ID of target device for the response + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @code: response code. The supported codes include: + * 0: Successful; 1: Response Failure; 2: Invalid Request. + * @addr: The fault address. Must match the addr field of the + * last iommu_hwpt_pgfault of a reported iopf group. 
+ */ +struct iommu_hwpt_page_response { + __u32 size; + __u32 flags; + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 code; + __u64 addr; +}; + +/** + * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_ALLOC) + * @size: sizeof(struct iommu_fault_alloc) + * @flags: Must be 0 + * @out_fault_id: The ID of the new FAULT + * @out_fault_fd: The fd of the new FAULT + * + * Explicitly allocate a fault handling object. + */ +struct iommu_fault_alloc { + __u32 size; + __u32 flags; + __u32 out_fault_id; + __u32 out_fault_fd; +}; +#define IOMMU_FAULT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_ALLOC) #endif diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 2b78d03c0c15c0c1ce42774510b2ce27e4a49711..4c5acb573bbfe8c795d1586a65e1083d137192e3 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -35,7 +35,7 @@ #define VFIO_EEH 5 /* Two-stage IOMMU */ -#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ +#define __VFIO_RESERVED_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ #define VFIO_SPAPR_TCE_v2_IOMMU 7 diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index edf1c99c9936c8549e8a2938a2ff11875197b3d4..17cc2f67228495febc892062c98e5d1c9c1f8df4 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -279,6 +279,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) uint32_t parent_hwpt_id = 0; uint32_t parent_hwpt_id_not_work = 0; uint32_t test_hwpt_id = 0; + uint32_t iopf_hwpt_id; + uint32_t fault_id; + uint32_t fault_fd; if (self->device_id) { /* Negative tests */ @@ -326,6 +329,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) sizeof(data)); /* Allocate two nested hwpts sharing one common parent hwpt */ + test_ioctl_fault_alloc(&fault_id, &fault_fd); test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0, &nested_hwpt_id[0], IOMMU_HWPT_DATA_SELFTEST, &data, @@ -334,6 +338,10 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) &nested_hwpt_id[1], IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + test_cmd_hwpt_alloc_iopf(self->device_id, parent_hwpt_id, fault_id, + IOMMU_HWPT_FAULT_ID_VALID, &iopf_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[0], IOMMU_TEST_IOTLB_DEFAULT); test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[1], @@ -504,14 +512,23 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) _test_ioctl_destroy(self->fd, nested_hwpt_id[1])); test_ioctl_destroy(nested_hwpt_id[0]); + /* Switch from nested_hwpt_id[1] to iopf_hwpt_id */ + test_cmd_mock_domain_replace(self->stdev_id, iopf_hwpt_id); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, iopf_hwpt_id)); + /* Trigger an IOPF on the device */ + test_cmd_trigger_iopf(self->device_id, fault_fd); + /* Detach from nested_hwpt_id[1] and destroy it */ test_cmd_mock_domain_replace(self->stdev_id, parent_hwpt_id); test_ioctl_destroy(nested_hwpt_id[1]); + test_ioctl_destroy(iopf_hwpt_id); /* Detach from the parent hw_pagetable and destroy it */ test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); test_ioctl_destroy(parent_hwpt_id); test_ioctl_destroy(parent_hwpt_id_not_work); + test_ioctl_destroy(fault_id); } else { test_err_hwpt_alloc(ENOENT, self->device_id, self->ioas_id, 0, &parent_hwpt_id); diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index f590417cd67a95bf31065f9b7682bc7c627ebf3a..c5d5e69452b010946dc42e2f0dc1a71c0a0480c7 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ 
b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -615,7 +615,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL)) return -1; - if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id, + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, 0, &hwpt_id, IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 8d2b46b2114da814f75740992c0dc4b1be14d33b..9be62e5781e794dce40c1935799a92c2519e6e88 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -153,7 +153,7 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, EXPECT_ERRNO(_errno, _test_cmd_mock_domain_replace(self->fd, stdev_id, \ pt_id, NULL)) -static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, +static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_id, __u32 flags, __u32 *hwpt_id, __u32 data_type, void *data, size_t data_len) { @@ -165,6 +165,7 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, .data_type = data_type, .data_len = data_len, .data_uptr = (uint64_t)data, + .fault_id = ft_id, }; int ret; @@ -177,24 +178,30 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, } #define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id) \ - ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \ 0)) #define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \ - self->fd, device_id, pt_id, flags, \ + self->fd, device_id, pt_id, 0, flags, \ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, 0)) #define test_cmd_hwpt_alloc_nested(device_id, pt_id, flags, hwpt_id, \ data_type, data, data_len) \ - ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, data_type, data, data_len)) #define test_err_hwpt_alloc_nested(_errno, device_id, pt_id, flags, hwpt_id, \ data_type, data, data_len) \ EXPECT_ERRNO(_errno, \ - _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, data_type, data, data_len)) +#define test_cmd_hwpt_alloc_iopf(device_id, pt_id, fault_id, flags, hwpt_id, \ + data_type, data, data_len) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, fault_id, \ + flags, hwpt_id, data_type, data, \ + data_len)) + #define test_cmd_hwpt_check_iotlb(hwpt_id, iotlb_id, expected) \ ({ \ struct iommu_test_cmd test_cmd = { \ @@ -684,3 +691,69 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, #define test_cmd_get_hw_capabilities(device_id, caps, mask) \ ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps)) + +static int _test_ioctl_fault_alloc(int fd, __u32 *fault_id, __u32 *fault_fd) +{ + struct iommu_fault_alloc cmd = { + .size = sizeof(cmd), + }; + int ret; + + ret = ioctl(fd, IOMMU_FAULT_ALLOC, &cmd); + if (ret) + return ret; + *fault_id = cmd.out_fault_id; + *fault_fd = cmd.out_fault_fd; + return 0; +} + +#define test_ioctl_fault_alloc(fault_id, fault_fd) \ + ({ \ + ASSERT_EQ(0, _test_ioctl_fault_alloc(self->fd, fault_id, \ + fault_fd)); \ + ASSERT_NE(0, *(fault_id)); \ + ASSERT_NE(0, *(fault_fd)); \ + }) + 
+static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd) +{ + struct iommu_test_cmd trigger_iopf_cmd = { + .size = sizeof(trigger_iopf_cmd), + .op = IOMMU_TEST_OP_TRIGGER_IOPF, + .trigger_iopf = { + .dev_id = device_id, + .pasid = 0x1, + .grpid = 0x2, + .perm = IOMMU_PGFAULT_PERM_READ | IOMMU_PGFAULT_PERM_WRITE, + .addr = 0xdeadbeaf, + }, + }; + struct iommu_hwpt_page_response response = { + .size = sizeof(struct iommu_hwpt_page_response), + .dev_id = device_id, + .pasid = 0x1, + .grpid = 0x2, + .code = 0, + .addr = 0xdeadbeaf, + }; + struct iommu_hwpt_pgfault fault = {}; + ssize_t bytes; + int ret; + + ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_IOPF), &trigger_iopf_cmd); + if (ret) + return ret; + + bytes = read(fault_fd, &fault, sizeof(fault)); + if (bytes < 0) + return bytes; + + bytes = write(fault_fd, &response, sizeof(response)); + if (bytes < 0) + return bytes; + + return 0; +} + +#define test_cmd_trigger_iopf(device_id, fault_fd) \ + ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, fault_fd))
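The new fault uapi above is easiest to follow end to end from user space. Below is a minimal, hypothetical sketch (not part of this series) of how a VMM might consume it: it assumes an iommufd file descriptor plus a bound device ID and a parent IOAS/HWPT ID obtained elsewhere (for example through VFIO cdev binding), allocates a fault object, binds it to a new fault-capable HWPT with IOMMU_HWPT_FAULT_ID_VALID, and then services a single page request by reading an iommu_hwpt_pgfault record from the fault fd and writing back an iommu_hwpt_page_response. The helper name and the error handling are illustrative only.

/*
 * Hypothetical userspace sketch against the uapi added in this series.
 * iommufd, dev_id and pt_id are assumed to have been set up elsewhere.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int handle_one_fault(int iommufd, uint32_t dev_id, uint32_t pt_id)
{
	struct iommu_fault_alloc fault_cmd = { .size = sizeof(fault_cmd) };
	struct iommu_hwpt_alloc hwpt_cmd = {
		.size = sizeof(hwpt_cmd),
		.flags = IOMMU_HWPT_FAULT_ID_VALID,
		.dev_id = dev_id,
		.pt_id = pt_id,
		.data_type = IOMMU_HWPT_DATA_NONE,
	};
	struct iommu_hwpt_pgfault pgfault;
	struct iommu_hwpt_page_response resp = { .size = sizeof(resp) };

	/* Create the fault object; the kernel returns an object ID and an fd. */
	if (ioctl(iommufd, IOMMU_FAULT_ALLOC, &fault_cmd))
		return -1;

	/* Bind the fault object to a new HWPT at allocation time. */
	hwpt_cmd.fault_id = fault_cmd.out_fault_id;
	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &hwpt_cmd))
		return -1;

	/* hwpt_cmd.out_hwpt_id would then be attached to the device elsewhere. */

	/* Page requests arrive as iommu_hwpt_pgfault records on the fault fd. */
	if (read(fault_cmd.out_fault_fd, &pgfault, sizeof(pgfault)) !=
	    sizeof(pgfault))
		return -1;

	/*
	 * A real handler would collect records until one carries
	 * IOMMU_PGFAULT_FLAGS_LAST_PAGE and then resolve the faulting range,
	 * e.g. by fixing up the guest I/O page table.
	 */

	/* Complete the group; grpid/addr must match the reported fault. */
	resp.dev_id = pgfault.dev_id;
	resp.pasid = pgfault.pasid;
	resp.grpid = pgfault.grpid;
	resp.code = 0;	/* 0: Successful */
	resp.addr = pgfault.addr;
	if (write(fault_cmd.out_fault_fd, &resp, sizeof(resp)) != sizeof(resp))
		return -1;

	return 0;
}

Using a plain read()/write() protocol on a dedicated fd keeps fault delivery and completion out of the ioctl path, and the wait queue in struct iommufd_fault suggests the fd can also be poll()ed alongside the VMM's other event sources. The selftest additions above exercise the same flow synthetically through IOMMU_TEST_OP_TRIGGER_IOPF.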