diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0aca57a90cd9211eac068276fb3c9658119274e5..7e54b6e91071e267e47433dcfdda718d9f22e1f0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2254,7 +2254,7 @@ forcing Dual Address Cycle for PCI cards supporting greater than 32-bit addressing. - iommu.strict= [ARM64, X86] Configure TLB invalidation behaviour + iommu.strict= [ARM64, X86, S390] Configure TLB invalidation behaviour Format: { "0" | "1" } 0 - Lazy mode. Request that DMA unmap operations use deferred @@ -5729,9 +5729,10 @@ s390_iommu= [HW,S390] Set s390 IOTLB flushing mode strict - With strict flushing every unmap operation will result in - an IOTLB flush. Default is lazy flushing before reuse, - which is faster. + With strict flushing every unmap operation will result + in an IOTLB flush. Default is lazy flushing before + reuse, which is faster. Deprecated, equivalent to + iommu.strict=1. s390_iommu_aperture= [KNL,S390] Specifies the size of the per device DMA address space diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 33a1b2a1bf9551330392c5e3fc2efc9424043c4a..18c1b5ba4aeac57225dbfbaeab24b3d141827dd9 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1074,10 +1074,10 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) } EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); -extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, - struct iommu_table *tbl, - unsigned long entry, unsigned long *hpa, - enum dma_data_direction *direction) +long iommu_tce_xchg_no_kill(struct mm_struct *mm, + struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction) { long ret; unsigned long size = 0; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 2de7f6dcd32b06badc8741984ab1d6d542963c07..11e062b47d3f80681cb0362f85cfa7ca7fcda7f6 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -157,7 +157,7 @@ static int die_owner = -1; static unsigned int die_nest_count; static int die_counter; -extern void panic_flush_kmsg_start(void) +void panic_flush_kmsg_start(void) { /* * These are mostly taken from kernel/panic.c, but tries to do @@ -170,7 +170,7 @@ extern void panic_flush_kmsg_start(void) bust_spinlocks(1); } -extern void panic_flush_kmsg_end(void) +void panic_flush_kmsg_end(void) { kmsg_dump(KMSG_DUMP_PANIC); bust_spinlocks(0); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 395659f2f4c8efc81a9677ef0c160a9a8c08246d..c7623fd4d6d92fc92aeddd2ffd9ba64e2199391c 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -77,8 +77,8 @@ static void kvm_spapr_tce_liobn_put(struct kref *kref) call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free); } -extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, - struct iommu_group *grp) +void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp) { int i; struct kvmppc_spapr_tce_table *stt; @@ -105,8 +105,8 @@ extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, rcu_read_unlock(); } -extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, - struct iommu_group *grp) +long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp) { struct kvmppc_spapr_tce_table *stt = NULL; bool found = false; diff --git 
a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index b248694e00247b57b5be3338535038e9910335d1..3f74f1cf37dfda0ef2a3c1efbbdbd222c2485366 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -159,13 +159,6 @@ struct zpci_dev { unsigned long *dma_table; int tlb_refresh; - spinlock_t iommu_bitmap_lock; - unsigned long *iommu_bitmap; - unsigned long *lazy_bitmap; - unsigned long iommu_size; - unsigned long iommu_pages; - unsigned int next_bit; - struct iommu_device iommu_dev; /* IOMMU core handle */ char res_name[16]; diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index d6189ed14f84874ac1f135d1480377d64f6f7383..f0c677ddd270606df61e7fd6ccd9b6c17f89f6b9 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -50,6 +50,9 @@ struct clp_fh_list_entry { #define CLP_UTIL_STR_LEN 64 #define CLP_PFIP_NR_SEGMENTS 4 +/* PCI function type numbers */ +#define PCI_FUNC_TYPE_ISM 0x5 /* ISM device */ + extern bool zpci_unique_uid; struct clp_rsp_slpc_pci { diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h index 7119c04c51c5c864677de1ed928b33a8b74b6d74..42d7cc4262ca48d1368cc31ab804f07bf558a7b7 100644 --- a/arch/s390/include/asm/pci_dma.h +++ b/arch/s390/include/asm/pci_dma.h @@ -82,117 +82,16 @@ enum zpci_ioat_dtype { #define ZPCI_TABLE_VALID_MASK 0x20 #define ZPCI_TABLE_PROT_MASK 0x200 -static inline unsigned int calc_rtx(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK; -} - -static inline unsigned int calc_sx(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK; -} - -static inline unsigned int calc_px(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> PAGE_SHIFT) & ZPCI_PT_MASK; -} - -static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa) -{ - *entry &= ZPCI_PTE_FLAG_MASK; - *entry |= (pfaa & ZPCI_PTE_ADDR_MASK); -} - -static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto) -{ - *entry &= ZPCI_RTE_FLAG_MASK; - *entry |= (sto & ZPCI_RTE_ADDR_MASK); - *entry |= ZPCI_TABLE_TYPE_RTX; -} - -static inline void set_st_pto(unsigned long *entry, phys_addr_t pto) -{ - *entry &= ZPCI_STE_FLAG_MASK; - *entry |= (pto & ZPCI_STE_ADDR_MASK); - *entry |= ZPCI_TABLE_TYPE_SX; -} - -static inline void validate_rt_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry &= ~ZPCI_TABLE_OFFSET_MASK; - *entry |= ZPCI_TABLE_VALID; - *entry |= ZPCI_TABLE_LEN_RTX; -} - -static inline void validate_st_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry |= ZPCI_TABLE_VALID; -} - -static inline void invalidate_pt_entry(unsigned long *entry) -{ - WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID); - *entry &= ~ZPCI_PTE_VALID_MASK; - *entry |= ZPCI_PTE_INVALID; -} - -static inline void validate_pt_entry(unsigned long *entry) -{ - WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID); - *entry &= ~ZPCI_PTE_VALID_MASK; - *entry |= ZPCI_PTE_VALID; -} - -static inline void entry_set_protected(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_PROT_MASK; - *entry |= ZPCI_TABLE_PROTECTED; -} - -static inline void entry_clr_protected(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_PROT_MASK; - *entry |= ZPCI_TABLE_UNPROTECTED; -} - -static inline int reg_entry_isvalid(unsigned long entry) -{ - return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID; -} - -static inline int pt_entry_isvalid(unsigned long entry) -{ - return (entry & ZPCI_PTE_VALID_MASK) == 
ZPCI_PTE_VALID; -} - -static inline unsigned long *get_rt_sto(unsigned long entry) -{ - if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX) - return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK); - else - return NULL; - -} - -static inline unsigned long *get_st_pto(unsigned long entry) -{ - if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX) - return phys_to_virt(entry & ZPCI_STE_ADDR_MASK); - else - return NULL; -} - -/* Prototypes */ -void dma_free_seg_table(unsigned long); -unsigned long *dma_alloc_cpu_table(gfp_t gfp); -void dma_cleanup_tables(unsigned long *); -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, - gfp_t gfp); -void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags); - -extern const struct dma_map_ops s390_pci_dma_ops; +struct zpci_iommu_ctrs { + atomic64_t mapped_pages; + atomic64_t unmapped_pages; + atomic64_t global_rpcits; + atomic64_t sync_map_rpcits; + atomic64_t sync_rpcits; +}; + +struct zpci_dev; +struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev); #endif diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index 5ae31ca9dd441d6180b13e624c0adaca4e49fc23..0547a10406e72a1a0745a842228130bc0710f1a0 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -3,7 +3,7 @@ # Makefile for the s390 PCI subsystem. # -obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ +obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_clp.o pci_sysfs.o \ pci_event.o pci_debug.o pci_insn.o pci_mmio.o \ pci_bus.o pci_kvm_hook.o obj-$(CONFIG_PCI_IOV) += pci_iov.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 777362cb4ea80ba17a4d0e8929815d11a8c513f0..7b83f669a228afb0f2629c739cf1fff3b6e41a3c 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -124,7 +124,11 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, WARN_ON_ONCE(iota & 0x3fff); fib.pba = base; - fib.pal = limit; + /* Work around off by one in ISM virt device */ + if (zdev->pft == PCI_FUNC_TYPE_ISM && limit > base) + fib.pal = limit + (1 << 12); + else + fib.pal = limit; fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, status); @@ -582,7 +586,6 @@ int pcibios_device_add(struct pci_dev *pdev) pdev->no_vf_scan = 1; pdev->dev.groups = zpci_attr_groups; - pdev->dev.dma_ops = &s390_pci_dma_ops; zpci_map_resources(pdev); for (i = 0; i < PCI_STD_NUM_BARS; i++) { @@ -756,8 +759,6 @@ int zpci_hot_reset_device(struct zpci_dev *zdev) if (zdev->dma_table) rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, virt_to_phys(zdev->dma_table), &status); - else - rc = zpci_dma_init_device(zdev); if (rc) { zpci_disable_device(zdev); return rc; @@ -865,11 +866,6 @@ int zpci_deconfigure_device(struct zpci_dev *zdev) if (zdev->zbus->bus) zpci_bus_remove_device(zdev, false); - if (zdev->dma_table) { - rc = zpci_dma_exit_device(zdev); - if (rc) - return rc; - } if (zdev_enabled(zdev)) { rc = zpci_disable_device(zdev); if (rc) @@ -918,8 +914,6 @@ void zpci_release_device(struct kref *kref) if (zdev->zbus->bus) zpci_bus_remove_device(zdev, false); - if (zdev->dma_table) - zpci_dma_exit_device(zdev); if (zdev_enabled(zdev)) zpci_disable_device(zdev); @@ -1109,10 +1103,6 @@ static int __init pci_base_init(void) if (rc) goto out_irq; - rc = zpci_dma_init(); - if (rc) - goto out_dma; - rc = clp_scan_pci_devices(); if (rc) goto out_find; @@ -1122,8 +1112,6 @@ static int __init pci_base_init(void) return 0; out_find: - zpci_dma_exit(); -out_dma: zpci_irq_exit(); out_irq: zpci_mem_exit(); 
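/*
 * Illustrative sketch, not part of the patch: with pci_dma.c removed, the
 * arch/s390/pci/pci.c paths above no longer call zpci_dma_init_device() or
 * zpci_dma_exit_device(). A re-enable sequence now simply re-registers the
 * IOMMU-owned dma_table, as the zpci_hot_reset_device() and recover_store()
 * hunks in this series do. The helper name below is hypothetical; the calls
 * (zpci_enable_device, zpci_register_ioat, zpci_disable_device) and the
 * zpci_dev fields are the ones used elsewhere in this patch.
 */
static int example_reenable_and_reregister(struct zpci_dev *zdev)
{
	u8 status;
	int rc;

	rc = zpci_enable_device(zdev);
	if (rc)
		return rc;

	if (zdev->dma_table)
		rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
					virt_to_phys(zdev->dma_table), &status);
	if (rc)
		zpci_disable_device(zdev);
	return rc;
}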
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 32245b970a0cf1d72ace477e750f5c340d708943..daa5d7450c7d383b254d225d022e23a14c105e41 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -47,11 +47,6 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev) rc = zpci_enable_device(zdev); if (rc) return rc; - rc = zpci_dma_init_device(zdev); - if (rc) { - zpci_disable_device(zdev); - return rc; - } } if (!zdev->has_resources) { diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index ca6bd98eec136e9a920a250729108730ec652ef5..6dde2263c79d1f57274e016e5867fe3a09ed473c 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -53,9 +53,11 @@ static char *pci_fmt3_names[] = { }; static char *pci_sw_names[] = { - "Allocated pages", "Mapped pages", "Unmapped pages", + "Global RPCITs", + "Sync Map RPCITs", + "Sync RPCITs", }; static void pci_fmb_show(struct seq_file *m, char *name[], int length, @@ -69,10 +71,14 @@ static void pci_fmb_show(struct seq_file *m, char *name[], int length, static void pci_sw_counter_show(struct seq_file *m) { - struct zpci_dev *zdev = m->private; - atomic64_t *counter = &zdev->allocated_pages; + struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(m->private); + atomic64_t *counter; int i; + if (!ctrs) + return; + + counter = &ctrs->mapped_pages; for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++) seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i], atomic64_read(counter)); diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c deleted file mode 100644 index 99209085c75bcb8b2a9f81687924aae83fceb78e..0000000000000000000000000000000000000000 --- a/arch/s390/pci/pci_dma.c +++ /dev/null @@ -1,746 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corp. 
2012 - * - * Author(s): - * Jan Glauber - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -static struct kmem_cache *dma_region_table_cache; -static struct kmem_cache *dma_page_table_cache; -static int s390_iommu_strict; -static u64 s390_iommu_aperture; -static u32 s390_iommu_aperture_factor = 1; - -static int zpci_refresh_global(struct zpci_dev *zdev) -{ - return zpci_refresh_trans((u64) zdev->fh << 32, zdev->start_dma, - zdev->iommu_pages * PAGE_SIZE); -} - -unsigned long *dma_alloc_cpu_table(gfp_t gfp) -{ - unsigned long *table, *entry; - - table = kmem_cache_alloc(dma_region_table_cache, gfp); - if (!table) - return NULL; - - for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++) - *entry = ZPCI_TABLE_INVALID; - return table; -} - -static void dma_free_cpu_table(void *table) -{ - kmem_cache_free(dma_region_table_cache, table); -} - -static unsigned long *dma_alloc_page_table(gfp_t gfp) -{ - unsigned long *table, *entry; - - table = kmem_cache_alloc(dma_page_table_cache, gfp); - if (!table) - return NULL; - - for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++) - *entry = ZPCI_PTE_INVALID; - return table; -} - -static void dma_free_page_table(void *table) -{ - kmem_cache_free(dma_page_table_cache, table); -} - -static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp) -{ - unsigned long old_rte, rte; - unsigned long *sto; - - rte = READ_ONCE(*rtep); - if (reg_entry_isvalid(rte)) { - sto = get_rt_sto(rte); - } else { - sto = dma_alloc_cpu_table(gfp); - if (!sto) - return NULL; - - set_rt_sto(&rte, virt_to_phys(sto)); - validate_rt_entry(&rte); - entry_clr_protected(&rte); - - old_rte = cmpxchg(rtep, ZPCI_TABLE_INVALID, rte); - if (old_rte != ZPCI_TABLE_INVALID) { - /* Somone else was faster, use theirs */ - dma_free_cpu_table(sto); - sto = get_rt_sto(old_rte); - } - } - return sto; -} - -static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp) -{ - unsigned long old_ste, ste; - unsigned long *pto; - - ste = READ_ONCE(*step); - if (reg_entry_isvalid(ste)) { - pto = get_st_pto(ste); - } else { - pto = dma_alloc_page_table(gfp); - if (!pto) - return NULL; - set_st_pto(&ste, virt_to_phys(pto)); - validate_st_entry(&ste); - entry_clr_protected(&ste); - - old_ste = cmpxchg(step, ZPCI_TABLE_INVALID, ste); - if (old_ste != ZPCI_TABLE_INVALID) { - /* Somone else was faster, use theirs */ - dma_free_page_table(pto); - pto = get_st_pto(old_ste); - } - } - return pto; -} - -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, - gfp_t gfp) -{ - unsigned long *sto, *pto; - unsigned int rtx, sx, px; - - rtx = calc_rtx(dma_addr); - sto = dma_get_seg_table_origin(&rto[rtx], gfp); - if (!sto) - return NULL; - - sx = calc_sx(dma_addr); - pto = dma_get_page_table_origin(&sto[sx], gfp); - if (!pto) - return NULL; - - px = calc_px(dma_addr); - return &pto[px]; -} - -void dma_update_cpu_trans(unsigned long *ptep, phys_addr_t page_addr, int flags) -{ - unsigned long pte; - - pte = READ_ONCE(*ptep); - if (flags & ZPCI_PTE_INVALID) { - invalidate_pt_entry(&pte); - } else { - set_pt_pfaa(&pte, page_addr); - validate_pt_entry(&pte); - } - - if (flags & ZPCI_TABLE_PROTECTED) - entry_set_protected(&pte); - else - entry_clr_protected(&pte); - - xchg(ptep, pte); -} - -static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, - dma_addr_t dma_addr, size_t size, int flags) -{ - unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - phys_addr_t page_addr = (pa & PAGE_MASK); - unsigned 
long *entry; - int i, rc = 0; - - if (!nr_pages) - return -EINVAL; - - if (!zdev->dma_table) - return -EINVAL; - - for (i = 0; i < nr_pages; i++) { - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr, - GFP_ATOMIC); - if (!entry) { - rc = -ENOMEM; - goto undo_cpu_trans; - } - dma_update_cpu_trans(entry, page_addr, flags); - page_addr += PAGE_SIZE; - dma_addr += PAGE_SIZE; - } - -undo_cpu_trans: - if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) { - flags = ZPCI_PTE_INVALID; - while (i-- > 0) { - page_addr -= PAGE_SIZE; - dma_addr -= PAGE_SIZE; - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr, - GFP_ATOMIC); - if (!entry) - break; - dma_update_cpu_trans(entry, page_addr, flags); - } - } - return rc; -} - -static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, - size_t size, int flags) -{ - unsigned long irqflags; - int ret; - - /* - * With zdev->tlb_refresh == 0, rpcit is not required to establish new - * translations when previously invalid translation-table entries are - * validated. With lazy unmap, rpcit is skipped for previously valid - * entries, but a global rpcit is then required before any address can - * be re-used, i.e. after each iommu bitmap wrap-around. - */ - if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) { - if (!zdev->tlb_refresh) - return 0; - } else { - if (!s390_iommu_strict) - return 0; - } - - ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, - PAGE_ALIGN(size)); - if (ret == -ENOMEM && !s390_iommu_strict) { - /* enable the hypervisor to free some resources */ - if (zpci_refresh_global(zdev)) - goto out; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags); - bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, - zdev->lazy_bitmap, zdev->iommu_pages); - bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags); - ret = 0; - } -out: - return ret; -} - -static int dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, - dma_addr_t dma_addr, size_t size, int flags) -{ - int rc; - - rc = __dma_update_trans(zdev, pa, dma_addr, size, flags); - if (rc) - return rc; - - rc = __dma_purge_tlb(zdev, dma_addr, size, flags); - if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) - __dma_update_trans(zdev, pa, dma_addr, size, ZPCI_PTE_INVALID); - - return rc; -} - -void dma_free_seg_table(unsigned long entry) -{ - unsigned long *sto = get_rt_sto(entry); - int sx; - - for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++) - if (reg_entry_isvalid(sto[sx])) - dma_free_page_table(get_st_pto(sto[sx])); - - dma_free_cpu_table(sto); -} - -void dma_cleanup_tables(unsigned long *table) -{ - int rtx; - - if (!table) - return; - - for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++) - if (reg_entry_isvalid(table[rtx])) - dma_free_seg_table(table[rtx]); - - dma_free_cpu_table(table); -} - -static unsigned long __dma_alloc_iommu(struct device *dev, - unsigned long start, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - - return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages, - start, size, zdev->start_dma >> PAGE_SHIFT, - dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT), - 0); -} - -static dma_addr_t dma_alloc_address(struct device *dev, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long offset, flags; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); - offset = __dma_alloc_iommu(dev, zdev->next_bit, size); - if (offset == -1) { - if (!s390_iommu_strict) { - /* global flush before DMA addresses are reused */ - if 
(zpci_refresh_global(zdev)) - goto out_error; - - bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, - zdev->lazy_bitmap, zdev->iommu_pages); - bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); - } - /* wrap-around */ - offset = __dma_alloc_iommu(dev, 0, size); - if (offset == -1) - goto out_error; - } - zdev->next_bit = offset + size; - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - - return zdev->start_dma + offset * PAGE_SIZE; - -out_error: - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - return DMA_MAPPING_ERROR; -} - -static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long flags, offset; - - offset = (dma_addr - zdev->start_dma) >> PAGE_SHIFT; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); - if (!zdev->iommu_bitmap) - goto out; - - if (s390_iommu_strict) - bitmap_clear(zdev->iommu_bitmap, offset, size); - else - bitmap_set(zdev->lazy_bitmap, offset, size); - -out: - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); -} - -static inline void zpci_err_dma(unsigned long rc, unsigned long addr) -{ - struct { - unsigned long rc; - unsigned long addr; - } __packed data = {rc, addr}; - - zpci_err_hex(&data, sizeof(data)); -} - -static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long pa = page_to_phys(page) + offset; - int flags = ZPCI_PTE_VALID; - unsigned long nr_pages; - dma_addr_t dma_addr; - int ret; - - /* This rounds up number of pages based on size and offset */ - nr_pages = iommu_num_pages(pa, size, PAGE_SIZE); - dma_addr = dma_alloc_address(dev, nr_pages); - if (dma_addr == DMA_MAPPING_ERROR) { - ret = -ENOSPC; - goto out_err; - } - - /* Use rounded up size */ - size = nr_pages * PAGE_SIZE; - - if (direction == DMA_NONE || direction == DMA_TO_DEVICE) - flags |= ZPCI_TABLE_PROTECTED; - - ret = dma_update_trans(zdev, pa, dma_addr, size, flags); - if (ret) - goto out_free; - - atomic64_add(nr_pages, &zdev->mapped_pages); - return dma_addr + (offset & ~PAGE_MASK); - -out_free: - dma_free_address(dev, dma_addr, nr_pages); -out_err: - zpci_err("map error:\n"); - zpci_err_dma(ret, pa); - return DMA_MAPPING_ERROR; -} - -static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction direction, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - int npages, ret; - - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - dma_addr = dma_addr & PAGE_MASK; - ret = dma_update_trans(zdev, 0, dma_addr, npages * PAGE_SIZE, - ZPCI_PTE_INVALID); - if (ret) { - zpci_err("unmap error:\n"); - zpci_err_dma(ret, dma_addr); - return; - } - - atomic64_add(npages, &zdev->unmapped_pages); - dma_free_address(dev, dma_addr, npages); -} - -static void *s390_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - struct page *page; - phys_addr_t pa; - dma_addr_t map; - - size = PAGE_ALIGN(size); - page = alloc_pages(flag | __GFP_ZERO, get_order(size)); - if (!page) - return NULL; - - pa = page_to_phys(page); - map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0); - if (dma_mapping_error(dev, map)) { - __free_pages(page, get_order(size)); - return NULL; - } - - atomic64_add(size / PAGE_SIZE, 
&zdev->allocated_pages); - if (dma_handle) - *dma_handle = map; - return phys_to_virt(pa); -} - -static void s390_dma_free(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - - size = PAGE_ALIGN(size); - atomic64_sub(size / PAGE_SIZE, &zdev->allocated_pages); - s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0); - free_pages((unsigned long)vaddr, get_order(size)); -} - -/* Map a segment into a contiguous dma address area */ -static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, - size_t size, dma_addr_t *handle, - enum dma_data_direction dir) -{ - unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - dma_addr_t dma_addr_base, dma_addr; - int flags = ZPCI_PTE_VALID; - struct scatterlist *s; - phys_addr_t pa = 0; - int ret; - - dma_addr_base = dma_alloc_address(dev, nr_pages); - if (dma_addr_base == DMA_MAPPING_ERROR) - return -ENOMEM; - - dma_addr = dma_addr_base; - if (dir == DMA_NONE || dir == DMA_TO_DEVICE) - flags |= ZPCI_TABLE_PROTECTED; - - for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) { - pa = page_to_phys(sg_page(s)); - ret = __dma_update_trans(zdev, pa, dma_addr, - s->offset + s->length, flags); - if (ret) - goto unmap; - - dma_addr += s->offset + s->length; - } - ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags); - if (ret) - goto unmap; - - *handle = dma_addr_base; - atomic64_add(nr_pages, &zdev->mapped_pages); - - return ret; - -unmap: - dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base, - ZPCI_PTE_INVALID); - dma_free_address(dev, dma_addr_base, nr_pages); - zpci_err("map error:\n"); - zpci_err_dma(ret, pa); - return ret; -} - -static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nr_elements, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s = sg, *start = sg, *dma = sg; - unsigned int max = dma_get_max_seg_size(dev); - unsigned int size = s->offset + s->length; - unsigned int offset = s->offset; - int count = 0, i, ret; - - for (i = 1; i < nr_elements; i++) { - s = sg_next(s); - - s->dma_length = 0; - - if (s->offset || (size & ~PAGE_MASK) || - size + s->length > max) { - ret = __s390_dma_map_sg(dev, start, size, - &dma->dma_address, dir); - if (ret) - goto unmap; - - dma->dma_address += offset; - dma->dma_length = size - offset; - - size = offset = s->offset; - start = s; - dma = sg_next(dma); - count++; - } - size += s->length; - } - ret = __s390_dma_map_sg(dev, start, size, &dma->dma_address, dir); - if (ret) - goto unmap; - - dma->dma_address += offset; - dma->dma_length = size - offset; - - return count + 1; -unmap: - for_each_sg(sg, s, count, i) - s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s), - dir, attrs); - - return ret; -} - -static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nr_elements, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nr_elements, i) { - if (s->dma_length) - s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, - dir, attrs); - s->dma_address = 0; - s->dma_length = 0; - } -} - -static unsigned long *bitmap_vzalloc(size_t bits, gfp_t flags) -{ - size_t n = BITS_TO_LONGS(bits); - size_t bytes; - - if (unlikely(check_mul_overflow(n, sizeof(unsigned long), &bytes))) - return NULL; - - return vzalloc(bytes); -} - -int zpci_dma_init_device(struct zpci_dev *zdev) -{ - u8 
status; - int rc; - - /* - * At this point, if the device is part of an IOMMU domain, this would - * be a strong hint towards a bug in the IOMMU API (common) code and/or - * simultaneous access via IOMMU and DMA API. So let's issue a warning. - */ - WARN_ON(zdev->s390_domain); - - spin_lock_init(&zdev->iommu_bitmap_lock); - - zdev->dma_table = dma_alloc_cpu_table(GFP_KERNEL); - if (!zdev->dma_table) { - rc = -ENOMEM; - goto out; - } - - /* - * Restrict the iommu bitmap size to the minimum of the following: - * - s390_iommu_aperture which defaults to high_memory - * - 3-level pagetable address limit minus start_dma offset - * - DMA address range allowed by the hardware (clp query pci fn) - * - * Also set zdev->end_dma to the actual end address of the usable - * range, instead of the theoretical maximum as reported by hardware. - * - * This limits the number of concurrently usable DMA mappings since - * for each DMA mapped memory address we need a DMA address including - * extra DMA addresses for multiple mappings of the same memory address. - */ - zdev->start_dma = PAGE_ALIGN(zdev->start_dma); - zdev->iommu_size = min3(s390_iommu_aperture, - ZPCI_TABLE_SIZE_RT - zdev->start_dma, - zdev->end_dma - zdev->start_dma + 1); - zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1; - zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT; - zdev->iommu_bitmap = bitmap_vzalloc(zdev->iommu_pages, GFP_KERNEL); - if (!zdev->iommu_bitmap) { - rc = -ENOMEM; - goto free_dma_table; - } - if (!s390_iommu_strict) { - zdev->lazy_bitmap = bitmap_vzalloc(zdev->iommu_pages, GFP_KERNEL); - if (!zdev->lazy_bitmap) { - rc = -ENOMEM; - goto free_bitmap; - } - - } - if (zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table), &status)) { - rc = -EIO; - goto free_bitmap; - } - - return 0; -free_bitmap: - vfree(zdev->iommu_bitmap); - zdev->iommu_bitmap = NULL; - vfree(zdev->lazy_bitmap); - zdev->lazy_bitmap = NULL; -free_dma_table: - dma_free_cpu_table(zdev->dma_table); - zdev->dma_table = NULL; -out: - return rc; -} - -int zpci_dma_exit_device(struct zpci_dev *zdev) -{ - int cc = 0; - - /* - * At this point, if the device is part of an IOMMU domain, this would - * be a strong hint towards a bug in the IOMMU API (common) code and/or - * simultaneous access via IOMMU and DMA API. So let's issue a warning. - */ - WARN_ON(zdev->s390_domain); - if (zdev_enabled(zdev)) - cc = zpci_unregister_ioat(zdev, 0); - /* - * cc == 3 indicates the function is gone already. This can happen - * if the function was deconfigured/disabled suddenly and we have not - * received a new handle yet. 
- */ - if (cc && cc != 3) - return -EIO; - - dma_cleanup_tables(zdev->dma_table); - zdev->dma_table = NULL; - vfree(zdev->iommu_bitmap); - zdev->iommu_bitmap = NULL; - vfree(zdev->lazy_bitmap); - zdev->lazy_bitmap = NULL; - zdev->next_bit = 0; - return 0; -} - -static int __init dma_alloc_cpu_table_caches(void) -{ - dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables", - ZPCI_TABLE_SIZE, ZPCI_TABLE_ALIGN, - 0, NULL); - if (!dma_region_table_cache) - return -ENOMEM; - - dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables", - ZPCI_PT_SIZE, ZPCI_PT_ALIGN, - 0, NULL); - if (!dma_page_table_cache) { - kmem_cache_destroy(dma_region_table_cache); - return -ENOMEM; - } - return 0; -} - -int __init zpci_dma_init(void) -{ - s390_iommu_aperture = (u64)virt_to_phys(high_memory); - if (!s390_iommu_aperture_factor) - s390_iommu_aperture = ULONG_MAX; - else - s390_iommu_aperture *= s390_iommu_aperture_factor; - - return dma_alloc_cpu_table_caches(); -} - -void zpci_dma_exit(void) -{ - kmem_cache_destroy(dma_page_table_cache); - kmem_cache_destroy(dma_region_table_cache); -} - -const struct dma_map_ops s390_pci_dma_ops = { - .alloc = s390_dma_alloc, - .free = s390_dma_free, - .map_sg = s390_dma_map_sg, - .unmap_sg = s390_dma_unmap_sg, - .map_page = s390_dma_map_pages, - .unmap_page = s390_dma_unmap_pages, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, - .free_pages = dma_common_free_pages, - /* dma_supported is unconditionally true without a callback */ -}; -EXPORT_SYMBOL_GPL(s390_pci_dma_ops); - -static int __init s390_iommu_setup(char *str) -{ - if (!strcmp(str, "strict")) - s390_iommu_strict = 1; - return 1; -} - -__setup("s390_iommu=", s390_iommu_setup); - -static int __init s390_iommu_aperture_setup(char *str) -{ - if (kstrtou32(str, 10, &s390_iommu_aperture_factor)) - s390_iommu_aperture_factor = 1; - return 1; -} - -__setup("s390_iommu_aperture=", s390_iommu_aperture_setup); diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index d969f36bf186f295898249ec36f1b22d51a946eb..6fadc3c5d3cf6a72d8133ba6625fb39ab701d7f3 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -59,9 +59,16 @@ static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res) } } -static bool is_passed_through(struct zpci_dev *zdev) +static bool is_passed_through(struct pci_dev *pdev) { - return zdev->s390_domain; + struct zpci_dev *zdev = to_zpci(pdev); + bool ret; + + mutex_lock(&zdev->kzdev_lock); + ret = !!zdev->kzdev; + mutex_unlock(&zdev->kzdev_lock); + + return ret; } static bool is_driver_supported(struct pci_driver *driver) @@ -180,7 +187,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) } pdev->error_state = pci_channel_io_frozen; - if (is_passed_through(to_zpci(pdev))) { + if (is_passed_through(pdev)) { pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n", pci_name(pdev)); goto out_unlock; @@ -243,7 +250,7 @@ static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es) * we will inject the error event and let the guest recover the device * itself. 
*/ - if (is_passed_through(to_zpci(pdev))) + if (is_passed_through(pdev)) goto out; driver = to_pci_driver(pdev->dev.driver); if (driver && driver->err_handler && driver->err_handler->error_detected) @@ -311,8 +318,6 @@ static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) /* Even though the device is already gone we still * need to free zPCI resources as part of the disable. */ - if (zdev->dma_table) - zpci_dma_exit_device(zdev); if (zdev_enabled(zdev)) zpci_disable_device(zdev); zdev->state = ZPCI_FN_STATE_STANDBY; diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index cae280e5c047d1d5eaa405c86b5e8444350961c1..8a7abac5181645d6635ed95e1f5706942948c642 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -56,6 +56,7 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr, struct pci_dev *pdev = to_pci_dev(dev); struct zpci_dev *zdev = to_zpci(pdev); int ret = 0; + u8 status; /* Can't use device_remove_self() here as that would lead us to lock * the pci_rescan_remove_lock while holding the device' kernfs lock. @@ -82,12 +83,6 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr, pci_lock_rescan_remove(); if (pci_dev_is_added(pdev)) { pci_stop_and_remove_bus_device(pdev); - if (zdev->dma_table) { - ret = zpci_dma_exit_device(zdev); - if (ret) - goto out; - } - if (zdev_enabled(zdev)) { ret = zpci_disable_device(zdev); /* @@ -105,14 +100,16 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr, ret = zpci_enable_device(zdev); if (ret) goto out; - ret = zpci_dma_init_device(zdev); - if (ret) { - zpci_disable_device(zdev); - goto out; + + if (zdev->dma_table) { + ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + if (ret) + zpci_disable_device(zdev); } - pci_rescan_bus(zdev->zbus->bus); } out: + pci_rescan_bus(zdev->zbus->bus); pci_unlock_rescan_remove(); if (kn) sysfs_unbreak_active_protection(kn); diff --git a/drivers/crypto/ccp/hygon/hct.c b/drivers/crypto/ccp/hygon/hct.c index e78aa9bad463cf637fbe3a47b8bb768950ea2c9a..6f6893914a19f85253ead713547cb75484ed945f 100644 --- a/drivers/crypto/ccp/hygon/hct.c +++ b/drivers/crypto/ccp/hygon/hct.c @@ -2061,11 +2061,8 @@ static int hct_share_init(void) if (!ret) { hct_data.domain = iommu_domain_alloc(&pci_bus_type); if (!hct_data.domain) { + pr_err("iommu domain alloc failed\n"); misc_deregister(&hct_misc); - if (!pci_bus_type.iommu_ops) { - pr_err("iommu is disabled\n"); - return -ENODEV; - } return -ENOMEM; } hct_data.prot = IOMMU_READ | IOMMU_WRITE; diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index eb7d85a8f648176ed02d4201e95b919a3b9fe6b6..9bf45d11be107561d078287a2eea3166c9ae1bd5 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -95,7 +95,7 @@ config IOMMU_DEBUGFS choice prompt "IOMMU default domain type" depends on IOMMU_API - default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64 + default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64 || S390 default IOMMU_DEFAULT_DMA_STRICT help Choose the type of IOMMU domain used to manage DMA API usage by @@ -150,7 +150,7 @@ config OF_IOMMU # IOMMU-agnostic DMA-mapping layer config IOMMU_DMA - def_bool ARM64 || IA64 || X86 || LOONGARCH + def_bool ARM64 || IA64 || X86 || LOONGARCH || S390 select DMA_OPS select IOMMU_API select IOMMU_IOVA diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index 8bd4c3b183ec6e475b58a1990d7b5c33ab141120..443b2c13c37b5979ae574c6c7ab13ac2ce6b60f1 100644 
--- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -23,15 +23,6 @@ config AMD_IOMMU your BIOS for an option to enable it or if you have an IVRS ACPI table. -config AMD_IOMMU_V2 - tristate "AMD IOMMU Version 2 driver" - depends on AMD_IOMMU - select MMU_NOTIFIER - help - This option enables support for the AMD IOMMUv2 features of the IOMMU - hardware. Select this option if you want to use devices that support - the PCI PRI and PASID interface. - config AMD_IOMMU_DEBUGFS bool "Enable AMD IOMMU internals in DebugFS" depends on AMD_IOMMU && IOMMU_DEBUGFS diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 773d8aa002837eebce6d25c3cfea5e9b28bb85c3..f454fbb1569eb9d792338178faf9611ccb870635 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o -obj-$(CONFIG_AMD_IOMMU_V2) += iommu_v2.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 04293c65c1640c6b52df4fef61cf542d08ec1762..978d3060a67e10d6c126425f26404f7867c3e0fb 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -38,9 +38,6 @@ extern int amd_iommu_guest_ir; extern enum io_pgtable_fmt amd_iommu_pgtable; extern int amd_iommu_gpt_level; -/* IOMMUv2 specific functions */ -struct iommu_domain; - bool amd_iommu_v2_supported(void); struct amd_iommu *get_amd_iommu(unsigned int idx); u8 amd_iommu_pc_get_max_banks(unsigned int idx); @@ -51,10 +48,10 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value); -int amd_iommu_register_ppr_notifier(struct notifier_block *nb); -int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); -void amd_iommu_domain_direct_map(struct iommu_domain *dom); -int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); +/* Device capabilities */ +int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev); +void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev); + int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_update(struct protection_domain *domain); @@ -87,9 +84,25 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev) (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); } -static inline bool iommu_feature(struct amd_iommu *iommu, u64 mask) +static inline bool check_feature(u64 mask) +{ + return (amd_iommu_efr & mask); +} + +static inline bool check_feature2(u64 mask) +{ + return (amd_iommu_efr2 & mask); +} + +static inline int check_feature_gpt_level(void) +{ + return ((amd_iommu_efr >> FEATURE_GATS_SHIFT) & FEATURE_GATS_MASK); +} + +static inline bool amd_iommu_gt_ppr_supported(void) { - return !!(iommu->features & mask); + return (check_feature(FEATURE_GT) && + check_feature(FEATURE_PPR)); } static inline u64 iommu_virt_to_phys(void *vaddr) @@ -105,7 +118,6 @@ static inline void *iommu_phys_to_virt(unsigned long paddr) static inline void amd_iommu_domain_set_pt_root(struct protection_domain *domain, u64 root) { - atomic64_set(&domain->iop.pt_root, root); domain->iop.root = (u64 *)(root & PAGE_MASK); domain->iop.mode = root & 7; /* lowest 3 bits encode pgtable mode */ } @@ -161,8 +173,5 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain, u64 *root, int mode); struct 
dev_table_entry *get_dev_table(struct amd_iommu *iommu); -extern u64 amd_iommu_efr; -extern u64 amd_iommu_efr2; - extern bool amd_iommu_snp_en; #endif diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index dec4e5c2b66b8236fcd6faeb8497fdc9b42dfe20..90b7d7950a9efa032f116a448db3c96fb7570004 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -462,6 +462,10 @@ #define PD_IOMMUV2_MASK BIT(3) /* domain has gcr3 table */ #define PD_GIOV_MASK BIT(4) /* domain enable GIOV support */ +/* Timeout stuff */ +#define LOOP_TIMEOUT 100000 +#define MMIO_STATUS_TIMEOUT 2000000 + extern bool amd_iommu_dump; #define DUMP_printk(format, arg...) \ do { \ @@ -516,19 +520,6 @@ extern struct kmem_cache *amd_iommu_irq_cache; #define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) #define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) -/* - * This struct is used to pass information about - * incoming PPR faults around. - */ -struct amd_iommu_fault { - u64 address; /* IO virtual address of the fault*/ - u32 pasid; /* Address space identifier */ - u32 sbdf; /* Originating PCI device id */ - u16 tag; /* PPR tag */ - u16 flags; /* Fault flags */ - -}; - struct amd_iommu; struct iommu_domain; @@ -555,7 +546,6 @@ struct amd_io_pgtable { struct io_pgtable iop; int mode; u64 *root; - atomic64_t pt_root; /* pgtable root and pgtable mode */ u64 *pgd; /* v2 pgtable pgd pointer */ }; @@ -688,9 +678,6 @@ struct amd_iommu { /* Extended features 2 */ u64 features2; - /* IOMMUv2 */ - bool is_iommu_v2; - /* PCI device id of the IOMMU device */ u16 devid; @@ -811,6 +798,14 @@ struct devid_map { bool cmd_line; }; +#define AMD_IOMMU_DEVICE_FLAG_ATS_SUP 0x1 /* ATS feature supported */ +#define AMD_IOMMU_DEVICE_FLAG_PRI_SUP 0x2 /* PRI feature supported */ +#define AMD_IOMMU_DEVICE_FLAG_PASID_SUP 0x4 /* PASID context supported */ +/* Device may request execution on memory pages */ +#define AMD_IOMMU_DEVICE_FLAG_EXEC_SUP 0x8 +/* Device may request super-user privileges */ +#define AMD_IOMMU_DEVICE_FLAG_PRIV_SUP 0x10 + /* * This struct contains device specific data for the IOMMU */ @@ -823,13 +818,15 @@ struct iommu_dev_data { struct protection_domain *domain; /* Domain the device is bound to */ struct device *dev; u16 devid; /* PCI Device ID */ - bool iommu_v2; /* Device can make use of IOMMUv2 */ - struct { - bool enabled; - int qdep; - } ats; /* ATS state */ - bool pri_tlp; /* PASID TLB required for + + u32 flags; /* Holds AMD_IOMMU_DEVICE_FLAG_<*> */ + int ats_qdep; + u8 ats_enabled :1; /* ATS state */ + u8 pri_enabled :1; /* PRI state */ + u8 pasid_enabled:1; /* PASID state */ + u8 pri_tlp :1; /* PASID TLB required for PPR completions */ + u8 ppr :1; /* Enable device PPR support */ bool use_vapic; /* Enable device to use vapic mode */ bool defer_attach; @@ -896,16 +893,15 @@ extern unsigned amd_iommu_aperture_order; /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* Smallest max PASID supported by any IOMMU in the system */ -extern u32 amd_iommu_max_pasid; - -extern bool amd_iommu_v2_present; - extern bool amd_iommu_force_isolation; /* Max levels of glxval supported */ extern int amd_iommu_max_glx_val; +/* Global EFR and EFR2 registers */ +extern u64 amd_iommu_efr; +extern u64 amd_iommu_efr2; + /* * This function flushes all internal caches of * the IOMMU used by this driver. 
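/*
 * Illustrative sketch, not part of the patch: the reworked struct
 * iommu_dev_data separates what a device supports (dev_data->flags, filled
 * from pdev_get_caps() later in iommu.c) from what is currently enabled
 * (the ats_enabled/pri_enabled/pasid_enabled bits that replace the old
 * ats.enabled/iommu_v2 fields). A hypothetical helper built on that split
 * could look like this:
 */
static bool example_dev_can_use_pasid(struct iommu_dev_data *dev_data)
{
	return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) &&
	       dev_data->pasid_enabled;
}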
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index e3d4a23b66fd4ee69a9485781e70a9e21648b01d..ad5070e33af28fe16fd52c12638d0bf74e098bf0 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -83,8 +83,6 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 -#define LOOP_TIMEOUT 2000000 - #define IVRS_GET_SBDF_ID(seg, bus, dev, fn) (((seg & 0xffff) << 16) | ((bus & 0xff) << 8) \ | ((dev & 0x1f) << 3) | (fn & 0x7)) @@ -187,9 +185,6 @@ static int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; bool amd_iommu_iotlb_sup __read_mostly = true; -u32 amd_iommu_max_pasid __read_mostly = ~0; - -bool amd_iommu_v2_present __read_mostly; static bool amd_iommu_pc_present __read_mostly; bool amdr_ivrs_remap_support __read_mostly; @@ -272,7 +267,7 @@ int amd_iommu_get_num_iommus(void) * Iterate through all the IOMMUs to get common EFR * masks among all IOMMUs and warn if found inconsistency. */ -static void get_global_efr(void) +static __init void get_global_efr(void) { struct amd_iommu *iommu; @@ -304,16 +299,6 @@ static void get_global_efr(void) pr_info("Using global IVHD EFR:%#llx, EFR2:%#llx\n", amd_iommu_efr, amd_iommu_efr2); } -static bool check_feature_on_all_iommus(u64 mask) -{ - return !!(amd_iommu_efr & mask); -} - -static inline int check_feature_gpt_level(void) -{ - return ((amd_iommu_efr >> FEATURE_GATS_SHIFT) & FEATURE_GATS_MASK); -} - /* * For IVHD type 0x11/0x40, EFR is also available via IVHD. * Default to IVHD EFR since it is available sooner @@ -399,7 +384,7 @@ static void iommu_set_cwwb_range(struct amd_iommu *iommu) u64 start = iommu_virt_to_phys((void *)iommu->cmd_sem); u64 entry = start & PM_ADDR_MASK; - if (!check_feature_on_all_iommus(FEATURE_SNP)) + if (!check_feature(FEATURE_SNP)) return; /* Note: @@ -869,7 +854,7 @@ static void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, void *buf = (void *)__get_free_pages(gfp, order); if (buf && - check_feature_on_all_iommus(FEATURE_SNP) && + check_feature(FEATURE_SNP) && set_memory_4k((unsigned long)buf, (1 << order))) { free_pages((unsigned long)buf, order); buf = NULL; @@ -985,14 +970,14 @@ static int iommu_ga_log_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_GAINT_EN); iommu_feature_enable(iommu, CONTROL_GALOG_EN); - for (i = 0; i < LOOP_TIMEOUT; ++i) { + for (i = 0; i < MMIO_STATUS_TIMEOUT; ++i) { status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (status & (MMIO_STATUS_GALOG_RUN_MASK)) break; udelay(10); } - if (WARN_ON(i >= LOOP_TIMEOUT)) + if (WARN_ON(i >= MMIO_STATUS_TIMEOUT)) return -EINVAL; return 0; @@ -1048,7 +1033,7 @@ static void iommu_enable_xt(struct amd_iommu *iommu) static void iommu_enable_gt(struct amd_iommu *iommu) { - if (!iommu_feature(iommu, FEATURE_GT)) + if (!check_feature(FEATURE_GT)) return; iommu_feature_enable(iommu, CONTROL_GT_EN); @@ -1996,7 +1981,7 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu) u64 val; struct pci_dev *pdev = iommu->dev; - if (!iommu_feature(iommu, FEATURE_PC)) + if (!check_feature(FEATURE_PC)) return; amd_iommu_pc_present = true; @@ -2023,8 +2008,7 @@ static ssize_t amd_iommu_show_features(struct device *dev, struct device_attribute *attr, char *buf) { - struct amd_iommu *iommu = dev_to_amd_iommu(dev); - return sysfs_emit(buf, "%llx:%llx\n", iommu->features2, iommu->features); + return sysfs_emit(buf, "%llx:%llx\n", amd_iommu_efr, amd_iommu_efr2); } static DEVICE_ATTR(features, S_IRUGO, amd_iommu_show_features, NULL); @@ -2060,9 +2044,9 @@ static void __init 
late_iommu_features_init(struct amd_iommu *iommu) features = readq(iommu->mmio_base + MMIO_EXT_FEATURES); features2 = readq(iommu->mmio_base + MMIO_EXT_FEATURES2); - if (!iommu->features) { - iommu->features = features; - iommu->features2 = features2; + if (!amd_iommu_efr) { + amd_iommu_efr = features; + amd_iommu_efr2 = features2; return; } @@ -2070,12 +2054,12 @@ static void __init late_iommu_features_init(struct amd_iommu *iommu) * Sanity check and warn if EFR values from * IVHD and MMIO conflict. */ - if (features != iommu->features || - features2 != iommu->features2) { + if (features != amd_iommu_efr || + features2 != amd_iommu_efr2) { pr_warn(FW_WARN "EFR mismatch. Use IVHD EFR (%#llx : %#llx), EFR2 (%#llx : %#llx).\n", - features, iommu->features, - features2, iommu->features2); + features, amd_iommu_efr, + features2, amd_iommu_efr2); } } @@ -2104,20 +2088,17 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) late_iommu_features_init(iommu); - if (iommu_feature(iommu, FEATURE_GT)) { + if (check_feature(FEATURE_GT)) { int glxval; - u32 max_pasid; u64 pasmax; - pasmax = iommu->features & FEATURE_PASID_MASK; + pasmax = amd_iommu_efr & FEATURE_PASID_MASK; pasmax >>= FEATURE_PASID_SHIFT; - max_pasid = (1 << (pasmax + 1)) - 1; + iommu->iommu.max_pasids = (1 << (pasmax + 1)) - 1; - amd_iommu_max_pasid = min(amd_iommu_max_pasid, max_pasid); + BUG_ON(iommu->iommu.max_pasids & ~PASID_MASK); - BUG_ON(amd_iommu_max_pasid & ~PASID_MASK); - - glxval = iommu->features & FEATURE_GLXVAL_MASK; + glxval = amd_iommu_efr & FEATURE_GLXVAL_MASK; glxval >>= FEATURE_GLXVAL_SHIFT; if (amd_iommu_max_glx_val == -1) @@ -2126,13 +2107,7 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) amd_iommu_max_glx_val = min(amd_iommu_max_glx_val, glxval); } - if (iommu_feature(iommu, FEATURE_GT) && - iommu_feature(iommu, FEATURE_PPR)) { - iommu->is_iommu_v2 = true; - amd_iommu_v2_present = true; - } - - if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu)) + if (check_feature(FEATURE_PPR) && alloc_ppr_log(iommu)) return -ENOMEM; if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) { @@ -2144,13 +2119,10 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) init_iommu_perf_ctr(iommu); if (amd_iommu_pgtable == AMD_IOMMU_V2) { - if (!iommu_feature(iommu, FEATURE_GIOSUP) || - !iommu_feature(iommu, FEATURE_GT)) { + if (!check_feature(FEATURE_GIOSUP) || + !check_feature(FEATURE_GT)) { pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); amd_iommu_pgtable = AMD_IOMMU_V1; - } else if (iommu_default_passthrough()) { - pr_warn("V2 page table doesn't support passthrough mode. 
Fallback to v1.\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; } } @@ -2198,35 +2170,29 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) static void print_iommu_info(void) { + int i; static const char * const feat_str[] = { "PreF", "PPR", "X2APIC", "NX", "GT", "[5]", "IA", "GA", "HE", "PC" }; - struct amd_iommu *iommu; - - for_each_iommu(iommu) { - struct pci_dev *pdev = iommu->dev; - int i; - pci_info(pdev, "Found IOMMU cap 0x%x\n", iommu->cap_ptr); + if (amd_iommu_efr) { + pr_info("Extended features (%#llx, %#llx):", amd_iommu_efr, amd_iommu_efr2); - if (iommu->cap & (1 << IOMMU_CAP_EFR)) { - pr_info("Extended features (%#llx, %#llx):", iommu->features, iommu->features2); - - for (i = 0; i < ARRAY_SIZE(feat_str); ++i) { - if (iommu_feature(iommu, (1ULL << i))) - pr_cont(" %s", feat_str[i]); - } + for (i = 0; i < ARRAY_SIZE(feat_str); ++i) { + if (check_feature(1ULL << i)) + pr_cont(" %s", feat_str[i]); + } - if (iommu->features & FEATURE_GAM_VAPIC) - pr_cont(" GA_vAPIC"); + if (check_feature(FEATURE_GAM_VAPIC)) + pr_cont(" GA_vAPIC"); - if (iommu->features & FEATURE_SNP) - pr_cont(" SNP"); + if (check_feature(FEATURE_SNP)) + pr_cont(" SNP"); - pr_cont("\n"); - } + pr_cont("\n"); } + if (irq_remapping_enabled) { pr_info("Interrupt remapping enabled\n"); if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE) @@ -2912,19 +2878,19 @@ static void enable_iommus_vapic(void) * Need to set and poll check the GALOGRun bit to zero before * we can set/ modify GA Log registers safely. */ - for (i = 0; i < LOOP_TIMEOUT; ++i) { + for (i = 0; i < MMIO_STATUS_TIMEOUT; ++i) { status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (!(status & MMIO_STATUS_GALOG_RUN_MASK)) break; udelay(10); } - if (WARN_ON(i >= LOOP_TIMEOUT)) + if (WARN_ON(i >= MMIO_STATUS_TIMEOUT)) return; } if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && - !check_feature_on_all_iommus(FEATURE_GAM_VAPIC)) { + !check_feature(FEATURE_GAM_VAPIC)) { amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA; return; } @@ -3726,9 +3692,8 @@ bool amd_iommu_v2_supported(void) * (i.e. EFR[SNPSup]=1), IOMMUv2 page table cannot be used without * setting up IOMMUv1 page table. */ - return amd_iommu_v2_present && !amd_iommu_snp_en; + return amd_iommu_gt_ppr_supported() && !amd_iommu_snp_en; } -EXPORT_SYMBOL(amd_iommu_v2_supported); struct amd_iommu *get_amd_iommu(unsigned int idx) { @@ -3852,7 +3817,7 @@ int amd_iommu_snp_enable(void) return -EINVAL; } - amd_iommu_snp_en = check_feature_on_all_iommus(FEATURE_SNP); + amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) return -EINVAL; diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index 6c0777a3c57b79467b6b6d8c23bc97e0ff227619..4823c702290cec5acf9909ba8679213014535a74 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -363,10 +363,10 @@ static void v2_free_pgtable(struct io_pgtable *iop) if (!(pdom->flags & PD_IOMMUV2_MASK)) return; - /* - * Make changes visible to IOMMUs. No need to clear gcr3 entry - * as gcr3 table is already freed. 
- */ + /* Clear gcr3 entry */ + amd_iommu_domain_clear_gcr3(&pdom->domain, 0); + + /* Make changes visible to IOMMUs */ amd_iommu_domain_update(pdom); /* Free page table */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 1ebee66f99496e6e5b577d9e51c6550faf8db9a3..2906a279b991bbfb04c7182a6b835b4b07adf83c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -45,8 +45,6 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define LOOP_TIMEOUT 100000 - /* IO virtual address start page frame number */ #define IOVA_START_PFN (1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) @@ -66,9 +64,8 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; -const struct iommu_dirty_ops amd_dirty_ops; +static const struct iommu_dirty_ops amd_dirty_ops; -static ATOMIC_NOTIFIER_HEAD(ppr_notifier); int amd_iommu_max_glx_val = -1; /* @@ -81,7 +78,6 @@ struct iommu_cmd { struct kmem_cache *amd_iommu_irq_cache; static void detach_device(struct device *dev); -static int domain_enable_v2(struct protection_domain *domain, int pasids); /**************************************************************************** * @@ -324,24 +320,141 @@ static struct iommu_group *acpihid_device_group(struct device *dev) return entry->group; } -static bool pci_iommuv2_capable(struct pci_dev *pdev) +static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data) { - static const int caps[] = { - PCI_EXT_CAP_ID_PRI, - PCI_EXT_CAP_ID_PASID, - }; - int i, pos; + return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP); +} - if (!pci_ats_supported(pdev)) - return false; +static u32 pdev_get_caps(struct pci_dev *pdev) +{ + int features; + u32 flags = 0; + + if (pci_ats_supported(pdev)) + flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; + + if (pci_pri_supported(pdev)) + flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; + + features = pci_pasid_features(pdev); + if (features >= 0) { + flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + + if (features & PCI_PASID_CAP_EXEC) + flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; - for (i = 0; i < 2; ++i) { - pos = pci_find_ext_capability(pdev, caps[i]); - if (pos == 0) - return false; + if (features & PCI_PASID_CAP_PRIV) + flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; } - return true; + return flags; +} + +static inline int pdev_enable_cap_ats(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + int ret = -EINVAL; + + if (dev_data->ats_enabled) + return 0; + + if (amd_iommu_iotlb_sup && + (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) { + ret = pci_enable_ats(pdev, PAGE_SHIFT); + if (!ret) { + dev_data->ats_enabled = 1; + dev_data->ats_qdep = pci_ats_queue_depth(pdev); + } + } + + return ret; +} + +static inline void pdev_disable_cap_ats(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + + if (dev_data->ats_enabled) { + pci_disable_ats(pdev); + dev_data->ats_enabled = 0; + } +} + +int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + int ret = -EINVAL; + + if (dev_data->pri_enabled) + return 0; + + if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) { + /* + * First reset the PRI state of the device. 
+ * FIXME: Hardcode number of outstanding requests for now + */ + if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) { + dev_data->pri_enabled = 1; + dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev); + + ret = 0; + } + } + + return ret; +} + +void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + + if (dev_data->pri_enabled) { + pci_disable_pri(pdev); + dev_data->pri_enabled = 0; + } +} + +static inline int pdev_enable_cap_pasid(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + int ret = -EINVAL; + + if (dev_data->pasid_enabled) + return 0; + + if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) { + /* Only allow access to user-accessible pages */ + ret = pci_enable_pasid(pdev, 0); + if (!ret) + dev_data->pasid_enabled = 1; + } + + return ret; +} + +static inline void pdev_disable_cap_pasid(struct pci_dev *pdev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); + + if (dev_data->pasid_enabled) { + pci_disable_pasid(pdev); + dev_data->pasid_enabled = 0; + } +} + +static void pdev_enable_caps(struct pci_dev *pdev) +{ + pdev_enable_cap_ats(pdev); + pdev_enable_cap_pasid(pdev); + amd_iommu_pdev_enable_cap_pri(pdev); + +} + +static void pdev_disable_caps(struct pci_dev *pdev) +{ + pdev_disable_cap_ats(pdev); + pdev_disable_cap_pasid(pdev); + amd_iommu_pdev_disable_cap_pri(pdev); } /* @@ -401,8 +514,8 @@ static int iommu_init_device(struct amd_iommu *iommu, struct device *dev) * it'll be forced to go into translation mode. */ if ((iommu_default_passthrough() || !amd_iommu_force_isolation) && - dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) { - dev_data->iommu_v2 = iommu->is_iommu_v2; + dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) { + dev_data->flags = pdev_get_caps(to_pci_dev(dev)); } dev_iommu_priv_set(dev, dev_data); @@ -701,24 +814,6 @@ static void iommu_poll_events(struct amd_iommu *iommu) writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); } -static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw) -{ - struct amd_iommu_fault fault; - - if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) { - pr_err_ratelimited("Unknown PPR request received\n"); - return; - } - - fault.address = raw[1]; - fault.pasid = PPR_PASID(raw[0]); - fault.sbdf = PCI_SEG_DEVID_TO_SBDF(iommu->pci_seg->id, PPR_DEVID(raw[0])); - fault.tag = PPR_TAG(raw[0]); - fault.flags = PPR_FLAGS(raw[0]); - - atomic_notifier_call_chain(&ppr_notifier, 0, &fault); -} - static void iommu_poll_ppr_log(struct amd_iommu *iommu) { u32 head, tail; @@ -764,8 +859,7 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu) head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); - /* Handle PPR entry */ - iommu_handle_ppr_entry(iommu, entry); + /* TODO: PPR Handler will be added when we add IOPF support */ /* Refresh ring-buffer information */ head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); @@ -1102,7 +1196,7 @@ static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, } static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid, - int status, int tag, bool gn) + int status, int tag, u8 gn) { memset(cmd, 0, sizeof(*cmd)); @@ -1306,7 +1400,7 @@ static void amd_iommu_flush_irt_all(struct amd_iommu *iommu) void iommu_flush_all_caches(struct amd_iommu *iommu) { - if (iommu_feature(iommu, FEATURE_IA)) { + if (check_feature(FEATURE_IA)) { amd_iommu_flush_all(iommu); } else { 
amd_iommu_flush_dte_all(iommu); @@ -1324,7 +1418,7 @@ static int device_flush_iotlb(struct iommu_dev_data *dev_data, struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); struct iommu_cmd cmd; - int qdep = dev_data->ats.qdep; + int qdep = dev_data->ats_qdep; build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size); @@ -1368,7 +1462,7 @@ static int device_flush_dte(struct iommu_dev_data *dev_data) return ret; } - if (dev_data->ats.enabled) + if (dev_data->ats_enabled) ret = device_flush_iotlb(dev_data, 0, ~0UL); return ret; @@ -1401,7 +1495,7 @@ static void __domain_flush_pages(struct protection_domain *domain, list_for_each_entry(dev_data, &domain->dev_list, list) { - if (!dev_data->ats.enabled) + if (!dev_data->ats_enabled) continue; ret |= device_flush_iotlb(dev_data, address, size); @@ -1577,6 +1671,42 @@ static void free_gcr3_table(struct protection_domain *domain) free_page((unsigned long)domain->gcr3_tbl); } +/* + * Number of GCR3 table levels required. Level must be 4-Kbyte + * page and can contain up to 512 entries. + */ +static int get_gcr3_levels(int pasids) +{ + int levels; + + if (pasids == -1) + return amd_iommu_max_glx_val; + + levels = get_count_order(pasids); + + return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels; +} + +/* Note: This function expects iommu_domain->lock to be held prior calling the function. */ +static int setup_gcr3_table(struct protection_domain *domain, int pasids) +{ + int levels = get_gcr3_levels(pasids); + + if (levels > amd_iommu_max_glx_val) + return -EINVAL; + + domain->gcr3_tbl = alloc_pgtable_page(domain->nid, GFP_ATOMIC); + if (domain->gcr3_tbl == NULL) + return -ENOMEM; + + domain->glx = levels; + domain->flags |= PD_IOMMUV2_MASK; + + amd_iommu_domain_update(domain); + + return 0; +} + static void set_dte_entry(struct amd_iommu *iommu, u16 devid, struct protection_domain *domain, bool ats, bool ppr) { @@ -1605,10 +1735,8 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid, if (ats) flags |= DTE_FLAG_IOTLB; - if (ppr) { - if (iommu_feature(iommu, FEATURE_EPHSUP)) - pte_root |= 1ULL << DEV_ENTRY_PPR; - } + if (ppr) + pte_root |= 1ULL << DEV_ENTRY_PPR; if (domain->dirty_tracking) pte_root |= DTE_FLAG_HAD; @@ -1685,7 +1813,7 @@ static void do_attach(struct iommu_dev_data *dev_data, struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); bool ats; - ats = dev_data->ats.enabled; + ats = dev_data->ats_enabled; /* Update data structures */ dev_data->domain = domain; @@ -1701,7 +1829,7 @@ static void do_attach(struct iommu_dev_data *dev_data, /* Update device table */ set_dte_entry(iommu, dev_data->devid, domain, - ats, dev_data->iommu_v2); + ats, dev_data->ppr); clone_aliases(iommu, dev_data->dev); device_flush_dte(dev_data); @@ -1732,48 +1860,6 @@ static void do_detach(struct iommu_dev_data *dev_data) domain->dev_cnt -= 1; } -static void pdev_iommuv2_disable(struct pci_dev *pdev) -{ - pci_disable_ats(pdev); - pci_disable_pri(pdev); - pci_disable_pasid(pdev); -} - -static int pdev_pri_ats_enable(struct pci_dev *pdev) -{ - int ret; - - /* Only allow access to user-accessible pages */ - ret = pci_enable_pasid(pdev, 0); - if (ret) - return ret; - - /* First reset the PRI state of the device */ - ret = pci_reset_pri(pdev); - if (ret) - goto out_err_pasid; - - /* Enable PRI */ - /* FIXME: Hardcode number of outstanding requests for now */ - ret = pci_enable_pri(pdev, 32); - if (ret) - goto out_err_pasid; - - ret = pci_enable_ats(pdev, PAGE_SHIFT); - if (ret) - goto out_err_pri; - - return 0; - -out_err_pri: - 
pci_disable_pri(pdev); - -out_err_pasid: - pci_disable_pasid(pdev); - - return ret; -} - /* * If a device is not yet associated with a domain, this function makes the * device visible in the domain @@ -1782,9 +1868,8 @@ static int attach_device(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data; - struct pci_dev *pdev; unsigned long flags; - int ret; + int ret = 0; spin_lock_irqsave(&domain->lock, flags); @@ -1792,45 +1877,13 @@ static int attach_device(struct device *dev, spin_lock(&dev_data->lock); - ret = -EBUSY; - if (dev_data->domain != NULL) + if (dev_data->domain != NULL) { + ret = -EBUSY; goto out; - - if (!dev_is_pci(dev)) - goto skip_ats_check; - - pdev = to_pci_dev(dev); - if (domain->flags & PD_IOMMUV2_MASK) { - struct iommu_domain *def_domain = iommu_get_dma_domain(dev); - - ret = -EINVAL; - - /* - * In case of using AMD_IOMMU_V1 page table mode and the device - * is enabling for PPR/ATS support (using v2 table), - * we need to make sure that the domain type is identity map. - */ - if ((amd_iommu_pgtable == AMD_IOMMU_V1) && - def_domain->type != IOMMU_DOMAIN_IDENTITY) { - goto out; - } - - if (dev_data->iommu_v2) { - if (pdev_pri_ats_enable(pdev) != 0) - goto out; - - dev_data->ats.enabled = true; - dev_data->ats.qdep = pci_ats_queue_depth(pdev); - dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev); - } - } else if (amd_iommu_iotlb_sup && - pci_enable_ats(pdev, PAGE_SHIFT) == 0) { - dev_data->ats.enabled = true; - dev_data->ats.qdep = pci_ats_queue_depth(pdev); } -skip_ats_check: - ret = 0; + if (dev_is_pci(dev)) + pdev_enable_caps(to_pci_dev(dev)); do_attach(dev_data, domain); @@ -1878,15 +1931,8 @@ static void detach_device(struct device *dev) do_detach(dev_data); - if (!dev_is_pci(dev)) - goto out; - - if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2) - pdev_iommuv2_disable(to_pci_dev(dev)); - else if (dev_data->ats.enabled) - pci_disable_ats(to_pci_dev(dev)); - - dev_data->ats.enabled = false; + if (dev_is_pci(dev)) + pdev_disable_caps(to_pci_dev(dev)); out: spin_unlock(&dev_data->lock); @@ -1974,7 +2020,7 @@ static void update_device_table(struct protection_domain *domain) struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); set_dte_entry(iommu, dev_data->devid, domain, - dev_data->ats.enabled, dev_data->iommu_v2); + dev_data->ats_enabled, dev_data->ppr); clone_aliases(iommu, dev_data->dev); } } @@ -2008,9 +2054,11 @@ void amd_iommu_domain_update(struct protection_domain *domain) static void cleanup_domain(struct protection_domain *domain) { struct iommu_dev_data *entry; - unsigned long flags; - spin_lock_irqsave(&domain->lock, flags); + lockdep_assert_held(&domain->lock); + + if (!domain->dev_cnt) + return; while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, @@ -2018,8 +2066,7 @@ static void cleanup_domain(struct protection_domain *domain) BUG_ON(!entry->domain); do_detach(entry); } - - spin_unlock_irqrestore(&domain->lock, flags); + WARN_ON(domain->dev_cnt != 0); } static void protection_domain_free(struct protection_domain *domain) @@ -2030,6 +2077,12 @@ static void protection_domain_free(struct protection_domain *domain) if (domain->iop.pgtbl_cfg.tlb) free_io_pgtable_ops(&domain->iop.iop.ops); + if (domain->flags & PD_IOMMUV2_MASK) + free_gcr3_table(domain); + + if (domain->iop.root) + free_page((unsigned long)domain->iop.root); + if (domain->id) domain_id_free(domain->id); @@ -2042,18 +2095,10 @@ static int protection_domain_init_v1(struct protection_domain *domain, int 
mode) BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL); - spin_lock_init(&domain->lock); - domain->id = domain_id_alloc(); - if (!domain->id) - return -ENOMEM; - INIT_LIST_HEAD(&domain->dev_list); - if (mode != PAGE_MODE_NONE) { pt_root = (void *)get_zeroed_page(GFP_KERNEL); - if (!pt_root) { - domain_id_free(domain->id); + if (!pt_root) return -ENOMEM; - } } amd_iommu_domain_set_pgtable(domain, pt_root, mode); @@ -2063,20 +2108,12 @@ static int protection_domain_init_v1(struct protection_domain *domain, int mode) static int protection_domain_init_v2(struct protection_domain *domain) { - spin_lock_init(&domain->lock); - domain->id = domain_id_alloc(); - if (!domain->id) - return -ENOMEM; - INIT_LIST_HEAD(&domain->dev_list); - domain->flags |= PD_GIOV_MASK; domain->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - if (domain_enable_v2(domain, 1)) { - domain_id_free(domain->id); + if (setup_gcr3_table(domain, 1)) return -ENOMEM; - } return 0; } @@ -2086,57 +2123,60 @@ static struct protection_domain *protection_domain_alloc(unsigned int type) struct io_pgtable_ops *pgtbl_ops; struct protection_domain *domain; int pgtable; - int mode = DEFAULT_PGTABLE_LEVEL; int ret; + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + + domain->id = domain_id_alloc(); + if (!domain->id) + goto out_err; + + spin_lock_init(&domain->lock); + INIT_LIST_HEAD(&domain->dev_list); + domain->nid = NUMA_NO_NODE; + + switch (type) { + /* No need to allocate io pgtable ops in passthrough mode */ + case IOMMU_DOMAIN_IDENTITY: + return domain; + case IOMMU_DOMAIN_DMA: + pgtable = amd_iommu_pgtable; + break; /* - * Force IOMMU v1 page table when iommu=pt and - * when allocating domain for pass-through devices. + * Force IOMMU v1 page table when allocating + * domain for pass-through devices. 
*/ - if (type == IOMMU_DOMAIN_IDENTITY) { + case IOMMU_DOMAIN_UNMANAGED: pgtable = AMD_IOMMU_V1; - mode = PAGE_MODE_NONE; - } else if (type == IOMMU_DOMAIN_UNMANAGED) { - pgtable = AMD_IOMMU_V1; - } else if (type == IOMMU_DOMAIN_DMA || type == IOMMU_DOMAIN_DMA_FQ) { - pgtable = amd_iommu_pgtable; - } else { - return NULL; + break; + default: + goto out_err; } - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - switch (pgtable) { case AMD_IOMMU_V1: - ret = protection_domain_init_v1(domain, mode); + ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL); break; case AMD_IOMMU_V2: ret = protection_domain_init_v2(domain); break; default: ret = -EINVAL; + break; } if (ret) goto out_err; - /* No need to allocate io pgtable ops in passthrough mode */ - if (type == IOMMU_DOMAIN_IDENTITY) - return domain; - - domain->nid = NUMA_NO_NODE; - pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain); - if (!pgtbl_ops) { - domain_id_free(domain->id); + if (!pgtbl_ops) goto out_err; - } return domain; out_err: - kfree(domain); + protection_domain_free(domain); return NULL; } @@ -2218,12 +2258,15 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) return domain; } -static struct iommu_domain *amd_iommu_domain_alloc_user(struct device *dev, - u32 flags) +static struct iommu_domain * +amd_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) + { unsigned int type = IOMMU_DOMAIN_UNMANAGED; - if (flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) return ERR_PTR(-EOPNOTSUPP); return do_iommu_domain_alloc(type, dev, flags); @@ -2232,19 +2275,18 @@ static struct iommu_domain *amd_iommu_domain_alloc_user(struct device *dev, static void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain; + unsigned long flags; - domain = to_pdomain(dom); + if (!dom) + return; - if (domain->dev_cnt > 0) - cleanup_domain(domain); + domain = to_pdomain(dom); - BUG_ON(domain->dev_cnt != 0); + spin_lock_irqsave(&domain->lock, flags); - if (!dom) - return; + cleanup_domain(domain); - if (domain->flags & PD_IOMMUV2_MASK) - free_gcr3_table(domain); + spin_unlock_irqrestore(&domain->lock, flags); protection_domain_free(domain); } @@ -2292,14 +2334,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return ret; } -static void amd_iommu_iotlb_sync_map(struct iommu_domain *dom, - unsigned long iova, size_t size) +static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, + unsigned long iova, size_t size) { struct protection_domain *domain = to_pdomain(dom); struct io_pgtable_ops *ops = &domain->iop.iop.ops; if (ops->map_pages) domain_flush_np_cache(domain, iova, size); + return 0; } static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, @@ -2533,7 +2576,6 @@ bool amd_iommu_is_attach_deferred(struct device *dev) return dev_data->defer_attach; } -EXPORT_SYMBOL_GPL(amd_iommu_is_attach_deferred); static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { @@ -2573,7 +2615,7 @@ static int amd_iommu_def_domain_type(struct device *dev) * and require remapping. * - SNP is enabled, because it prohibits DTE[Mode]=0. 
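
For reference, the reworked teardown path above now has amd_iommu_domain_free() take domain->lock itself, let cleanup_domain() detach every device while the lock is held (enforced by lockdep_assert_held()), and only afterwards call protection_domain_free() to release the GCR3 table, page-table root and domain ID. A minimal userspace model of that ordering follows; the toy_domain type and stub bodies are assumptions made for the sketch, not kernel code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for the protection domain and its attached devices. */
struct toy_domain {
	pthread_mutex_t lock;
	int dev_cnt;
	void *pgtable_root;	/* stands in for iop.root / gcr3_tbl */
};

/* Caller must hold d->lock, mirroring the lockdep_assert_held() check. */
static void toy_cleanup_domain(struct toy_domain *d)
{
	while (d->dev_cnt)
		d->dev_cnt--;	/* "do_detach()" for each device */
}

static void toy_domain_free(struct toy_domain *d)
{
	pthread_mutex_lock(&d->lock);
	toy_cleanup_domain(d);		/* detach everything under the lock */
	pthread_mutex_unlock(&d->lock);

	free(d->pgtable_root);		/* then release memory and the ID */
	free(d);
}

int main(void)
{
	struct toy_domain *d = calloc(1, sizeof(*d));

	pthread_mutex_init(&d->lock, NULL);
	d->dev_cnt = 3;
	d->pgtable_root = malloc(4096);
	toy_domain_free(d);
	printf("domain torn down\n");
	return 0;
}
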
*/ - if (dev_data->iommu_v2 && + if (pdev_pasid_supported(dev_data) && !cc_platform_has(CC_ATTR_MEM_ENCRYPT) && !amd_iommu_snp_en) { return IOMMU_DOMAIN_IDENTITY; @@ -2588,7 +2630,7 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } -const struct iommu_dirty_ops amd_dirty_ops = { +static const struct iommu_dirty_ops amd_dirty_ops = { .set_dirty_tracking = amd_iommu_set_dirty_tracking, .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, }; @@ -2618,93 +2660,6 @@ const struct iommu_ops amd_iommu_ops = { } }; -/***************************************************************************** - * - * The next functions do a basic initialization of IOMMU for pass through - * mode - * - * In passthrough mode the IOMMU is initialized and enabled but not used for - * DMA-API translation. - * - *****************************************************************************/ - -/* IOMMUv2 specific functions */ -int amd_iommu_register_ppr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&ppr_notifier, nb); -} -EXPORT_SYMBOL(amd_iommu_register_ppr_notifier); - -int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&ppr_notifier, nb); -} -EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); - -void amd_iommu_domain_direct_map(struct iommu_domain *dom) -{ - struct protection_domain *domain = to_pdomain(dom); - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - - if (domain->iop.pgtbl_cfg.tlb) - free_io_pgtable_ops(&domain->iop.iop.ops); - - spin_unlock_irqrestore(&domain->lock, flags); -} -EXPORT_SYMBOL(amd_iommu_domain_direct_map); - -/* Note: This function expects iommu_domain->lock to be held prior calling the function. */ -static int domain_enable_v2(struct protection_domain *domain, int pasids) -{ - int levels; - - /* Number of GCR3 table levels required */ - for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9) - levels += 1; - - if (levels > amd_iommu_max_glx_val) - return -EINVAL; - - domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC); - if (domain->gcr3_tbl == NULL) - return -ENOMEM; - - domain->glx = levels; - domain->flags |= PD_IOMMUV2_MASK; - - amd_iommu_domain_update(domain); - - return 0; -} - -int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) -{ - struct protection_domain *pdom = to_pdomain(dom); - unsigned long flags; - int ret; - - spin_lock_irqsave(&pdom->lock, flags); - - /* - * Save us all sanity checks whether devices already in the - * domain support IOMMUv2. Just force that the domain has no - * devices attached when it is switched into IOMMUv2 mode. - */ - ret = -EBUSY; - if (pdom->dev_cnt > 0 || pdom->flags & PD_IOMMUV2_MASK) - goto out; - - if (!pdom->gcr3_tbl) - ret = domain_enable_v2(pdom, pasids); - -out: - spin_unlock_irqrestore(&pdom->lock, flags); - return ret; -} -EXPORT_SYMBOL(amd_iommu_domain_enable_v2); - static int __flush_pasid(struct protection_domain *domain, u32 pasid, u64 address, bool size) { @@ -2742,10 +2697,10 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, There might be non-IOMMUv2 capable devices in an IOMMUv2 * domain. 
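
The new get_gcr3_levels() added earlier in this patch replaces the open-coded loop still visible in the removed domain_enable_v2() above: each GCR3 table level resolves 9 PASID bits (512 entries), so the extra level count is DIV_ROUND_UP(get_count_order(pasids), 9) - 1. A standalone sketch of that arithmetic, with get_count_order() modelled by a plain loop:

#include <stdio.h>

/* Model of get_count_order(): smallest n such that 2^n >= count. */
static int count_order(unsigned int count)
{
	int order = 0;

	while ((1u << order) < count)
		order++;
	return order;
}

/* Mirrors get_gcr3_levels(): one table level resolves 9 PASID bits. */
static int gcr3_levels(unsigned int pasids)
{
	int order = count_order(pasids);

	return order ? (order + 8) / 9 - 1 : 0;	/* DIV_ROUND_UP(order, 9) - 1 */
}

int main(void)
{
	unsigned int vals[] = { 1, 512, 513, 1u << 18, (1u << 18) + 1 };

	for (int i = 0; i < 5; i++)
		printf("pasids=%7u -> extra levels=%d\n", vals[i], gcr3_levels(vals[i]));
	/* Prints 0, 0, 1, 1, 2: one 512-entry table covers 9 PASID bits,
	 * two levels cover 18, three cover 27. */
	return 0;
}
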
*/ - if (!dev_data->ats.enabled) + if (!dev_data->ats_enabled) continue; - qdep = dev_data->ats.qdep; + qdep = dev_data->ats_qdep; iommu = rlookup_amd_iommu(dev_data->dev); if (!iommu) continue; @@ -2786,7 +2741,6 @@ int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, return ret; } -EXPORT_SYMBOL(amd_iommu_flush_page); static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid) { @@ -2806,7 +2760,6 @@ int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) return ret; } -EXPORT_SYMBOL(amd_iommu_flush_tlb); static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) { @@ -2886,7 +2839,6 @@ int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, return ret; } -EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid) { @@ -2900,7 +2852,6 @@ int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid) return ret; } -EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag) @@ -2917,49 +2868,6 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, return iommu_queue_command(iommu, &cmd); } -EXPORT_SYMBOL(amd_iommu_complete_ppr); - -int amd_iommu_device_info(struct pci_dev *pdev, - struct amd_iommu_device_info *info) -{ - int max_pasids; - int pos; - - if (pdev == NULL || info == NULL) - return -EINVAL; - - if (!amd_iommu_v2_supported()) - return -EINVAL; - - memset(info, 0, sizeof(*info)); - - if (pci_ats_supported(pdev)) - info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; - - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (pos) - info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; - - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (pos) { - int features; - - max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1)); - max_pasids = min(max_pasids, (1 << 20)); - - info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; - info->max_pasids = min(pci_max_pasids(pdev), max_pasids); - - features = pci_pasid_features(pdev); - if (features & PCI_PASID_CAP_EXEC) - info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; - if (features & PCI_PASID_CAP_PRIV) - info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; - } - - return 0; -} -EXPORT_SYMBOL(amd_iommu_device_info); #ifdef CONFIG_IRQ_REMAP diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c deleted file mode 100644 index 57c2fb1146e25f113ce7f782e8cdf1e63abd31c9..0000000000000000000000000000000000000000 --- a/drivers/iommu/amd/iommu_v2.c +++ /dev/null @@ -1,996 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2010-2012 Advanced Micro Devices, Inc. 
- * Author: Joerg Roedel - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "amd_iommu.h" - -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Joerg Roedel "); - -#define PRI_QUEUE_SIZE 512 - -struct pri_queue { - atomic_t inflight; - bool finish; - int status; -}; - -struct pasid_state { - struct list_head list; /* For global state-list */ - refcount_t count; /* Reference count */ - unsigned mmu_notifier_count; /* Counting nested mmu_notifier - calls */ - struct mm_struct *mm; /* mm_struct for the faults */ - struct mmu_notifier mn; /* mmu_notifier handle */ - struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ - struct device_state *device_state; /* Link to our device_state */ - u32 pasid; /* PASID index */ - bool invalid; /* Used during setup and - teardown of the pasid */ - spinlock_t lock; /* Protect pri_queues and - mmu_notifer_count */ - wait_queue_head_t wq; /* To wait for count == 0 */ -}; - -struct device_state { - struct list_head list; - u32 sbdf; - atomic_t count; - struct pci_dev *pdev; - struct pasid_state **states; - struct iommu_domain *domain; - int pasid_levels; - int max_pasids; - amd_iommu_invalid_ppr_cb inv_ppr_cb; - amd_iommu_invalidate_ctx inv_ctx_cb; - spinlock_t lock; - wait_queue_head_t wq; -}; - -struct fault { - struct work_struct work; - struct device_state *dev_state; - struct pasid_state *state; - struct mm_struct *mm; - u64 address; - u32 pasid; - u16 tag; - u16 finish; - u16 flags; -}; - -static LIST_HEAD(state_list); -static DEFINE_SPINLOCK(state_lock); - -static struct workqueue_struct *iommu_wq; - -static void free_pasid_states(struct device_state *dev_state); - -static struct device_state *__get_device_state(u32 sbdf) -{ - struct device_state *dev_state; - - list_for_each_entry(dev_state, &state_list, list) { - if (dev_state->sbdf == sbdf) - return dev_state; - } - - return NULL; -} - -static struct device_state *get_device_state(u32 sbdf) -{ - struct device_state *dev_state; - unsigned long flags; - - spin_lock_irqsave(&state_lock, flags); - dev_state = __get_device_state(sbdf); - if (dev_state != NULL) - atomic_inc(&dev_state->count); - spin_unlock_irqrestore(&state_lock, flags); - - return dev_state; -} - -static void free_device_state(struct device_state *dev_state) -{ - struct iommu_group *group; - - /* Get rid of any remaining pasid states */ - free_pasid_states(dev_state); - - /* - * Wait until the last reference is dropped before freeing - * the device state. - */ - wait_event(dev_state->wq, !atomic_read(&dev_state->count)); - - /* - * First detach device from domain - No more PRI requests will arrive - * from that device after it is unbound from the IOMMUv2 domain. 
- */ - group = iommu_group_get(&dev_state->pdev->dev); - if (WARN_ON(!group)) - return; - - iommu_detach_group(dev_state->domain, group); - - iommu_group_put(group); - - /* Everything is down now, free the IOMMUv2 domain */ - iommu_domain_free(dev_state->domain); - - /* Finally get rid of the device-state */ - kfree(dev_state); -} - -static void put_device_state(struct device_state *dev_state) -{ - if (atomic_dec_and_test(&dev_state->count)) - wake_up(&dev_state->wq); -} - -/* Must be called under dev_state->lock */ -static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state, - u32 pasid, bool alloc) -{ - struct pasid_state **root, **ptr; - int level, index; - - level = dev_state->pasid_levels; - root = dev_state->states; - - while (true) { - - index = (pasid >> (9 * level)) & 0x1ff; - ptr = &root[index]; - - if (level == 0) - break; - - if (*ptr == NULL) { - if (!alloc) - return NULL; - - *ptr = (void *)get_zeroed_page(GFP_ATOMIC); - if (*ptr == NULL) - return NULL; - } - - root = (struct pasid_state **)*ptr; - level -= 1; - } - - return ptr; -} - -static int set_pasid_state(struct device_state *dev_state, - struct pasid_state *pasid_state, - u32 pasid) -{ - struct pasid_state **ptr; - unsigned long flags; - int ret; - - spin_lock_irqsave(&dev_state->lock, flags); - ptr = __get_pasid_state_ptr(dev_state, pasid, true); - - ret = -ENOMEM; - if (ptr == NULL) - goto out_unlock; - - ret = -ENOMEM; - if (*ptr != NULL) - goto out_unlock; - - *ptr = pasid_state; - - ret = 0; - -out_unlock: - spin_unlock_irqrestore(&dev_state->lock, flags); - - return ret; -} - -static void clear_pasid_state(struct device_state *dev_state, u32 pasid) -{ - struct pasid_state **ptr; - unsigned long flags; - - spin_lock_irqsave(&dev_state->lock, flags); - ptr = __get_pasid_state_ptr(dev_state, pasid, true); - - if (ptr == NULL) - goto out_unlock; - - *ptr = NULL; - -out_unlock: - spin_unlock_irqrestore(&dev_state->lock, flags); -} - -static struct pasid_state *get_pasid_state(struct device_state *dev_state, - u32 pasid) -{ - struct pasid_state **ptr, *ret = NULL; - unsigned long flags; - - spin_lock_irqsave(&dev_state->lock, flags); - ptr = __get_pasid_state_ptr(dev_state, pasid, false); - - if (ptr == NULL) - goto out_unlock; - - ret = *ptr; - if (ret) - refcount_inc(&ret->count); - -out_unlock: - spin_unlock_irqrestore(&dev_state->lock, flags); - - return ret; -} - -static void free_pasid_state(struct pasid_state *pasid_state) -{ - kfree(pasid_state); -} - -static void put_pasid_state(struct pasid_state *pasid_state) -{ - if (refcount_dec_and_test(&pasid_state->count)) - wake_up(&pasid_state->wq); -} - -static void put_pasid_state_wait(struct pasid_state *pasid_state) -{ - if (!refcount_dec_and_test(&pasid_state->count)) - wait_event(pasid_state->wq, !refcount_read(&pasid_state->count)); - free_pasid_state(pasid_state); -} - -static void unbind_pasid(struct pasid_state *pasid_state) -{ - struct iommu_domain *domain; - - domain = pasid_state->device_state->domain; - - /* - * Mark pasid_state as invalid, no more faults will we added to the - * work queue after this is visible everywhere. 
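
In the removed __get_pasid_state_ptr() above, per-device PASID state lives in a one- or two-level table indexed by 9-bit slices of the PASID, i.e. (pasid >> (9 * level)) & 0x1ff. A small standalone example of that index math; the PASID value is arbitrary:

#include <stdio.h>

/* 9-bit slice of a PASID used as the table index at a given level,
 * as in the removed __get_pasid_state_ptr(). */
static unsigned int pasid_slice(unsigned int pasid, int level)
{
	return (pasid >> (9 * level)) & 0x1ff;
}

int main(void)
{
	unsigned int pasid = 0x12345;	/* arbitrary example PASID */

	/* Two-level walk: directory index first, then the leaf index. */
	printf("pasid 0x%x -> level1 idx 0x%x, level0 idx 0x%x\n",
	       pasid, pasid_slice(pasid, 1), pasid_slice(pasid, 0));
	/* Prints: pasid 0x12345 -> level1 idx 0x91, level0 idx 0x145 */
	return 0;
}
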
- */ - pasid_state->invalid = true; - - /* Make sure this is visible */ - smp_wmb(); - - /* After this the device/pasid can't access the mm anymore */ - amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid); - - /* Make sure no more pending faults are in the queue */ - flush_workqueue(iommu_wq); -} - -static void free_pasid_states_level1(struct pasid_state **tbl) -{ - int i; - - for (i = 0; i < 512; ++i) { - if (tbl[i] == NULL) - continue; - - free_page((unsigned long)tbl[i]); - } -} - -static void free_pasid_states_level2(struct pasid_state **tbl) -{ - struct pasid_state **ptr; - int i; - - for (i = 0; i < 512; ++i) { - if (tbl[i] == NULL) - continue; - - ptr = (struct pasid_state **)tbl[i]; - free_pasid_states_level1(ptr); - } -} - -static void free_pasid_states(struct device_state *dev_state) -{ - struct pasid_state *pasid_state; - int i; - - for (i = 0; i < dev_state->max_pasids; ++i) { - pasid_state = get_pasid_state(dev_state, i); - if (pasid_state == NULL) - continue; - - put_pasid_state(pasid_state); - - /* Clear the pasid state so that the pasid can be re-used */ - clear_pasid_state(dev_state, pasid_state->pasid); - - /* - * This will call the mn_release function and - * unbind the PASID - */ - mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); - - put_pasid_state_wait(pasid_state); /* Reference taken in - amd_iommu_bind_pasid */ - - /* Drop reference taken in amd_iommu_bind_pasid */ - put_device_state(dev_state); - } - - if (dev_state->pasid_levels == 2) - free_pasid_states_level2(dev_state->states); - else if (dev_state->pasid_levels == 1) - free_pasid_states_level1(dev_state->states); - else - BUG_ON(dev_state->pasid_levels != 0); - - free_page((unsigned long)dev_state->states); -} - -static struct pasid_state *mn_to_state(struct mmu_notifier *mn) -{ - return container_of(mn, struct pasid_state, mn); -} - -static void mn_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - - pasid_state = mn_to_state(mn); - dev_state = pasid_state->device_state; - - if ((start ^ (end - 1)) < PAGE_SIZE) - amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, - start); - else - amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid); -} - -static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - bool run_inv_ctx_cb; - - might_sleep(); - - pasid_state = mn_to_state(mn); - dev_state = pasid_state->device_state; - run_inv_ctx_cb = !pasid_state->invalid; - - if (run_inv_ctx_cb && dev_state->inv_ctx_cb) - dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid); - - unbind_pasid(pasid_state); -} - -static const struct mmu_notifier_ops iommu_mn = { - .release = mn_release, - .arch_invalidate_secondary_tlbs = mn_arch_invalidate_secondary_tlbs, -}; - -static void set_pri_tag_status(struct pasid_state *pasid_state, - u16 tag, int status) -{ - unsigned long flags; - - spin_lock_irqsave(&pasid_state->lock, flags); - pasid_state->pri[tag].status = status; - spin_unlock_irqrestore(&pasid_state->lock, flags); -} - -static void finish_pri_tag(struct device_state *dev_state, - struct pasid_state *pasid_state, - u16 tag) -{ - unsigned long flags; - - spin_lock_irqsave(&pasid_state->lock, flags); - if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) && - pasid_state->pri[tag].finish) { - amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid, - 
pasid_state->pri[tag].status, tag); - pasid_state->pri[tag].finish = false; - pasid_state->pri[tag].status = PPR_SUCCESS; - } - spin_unlock_irqrestore(&pasid_state->lock, flags); -} - -static void handle_fault_error(struct fault *fault) -{ - int status; - - if (!fault->dev_state->inv_ppr_cb) { - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); - return; - } - - status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, - fault->pasid, - fault->address, - fault->flags); - switch (status) { - case AMD_IOMMU_INV_PRI_RSP_SUCCESS: - set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); - break; - case AMD_IOMMU_INV_PRI_RSP_INVALID: - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); - break; - case AMD_IOMMU_INV_PRI_RSP_FAIL: - set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); - break; - default: - BUG(); - } -} - -static bool access_error(struct vm_area_struct *vma, struct fault *fault) -{ - unsigned long requested = 0; - - if (fault->flags & PPR_FAULT_EXEC) - requested |= VM_EXEC; - - if (fault->flags & PPR_FAULT_READ) - requested |= VM_READ; - - if (fault->flags & PPR_FAULT_WRITE) - requested |= VM_WRITE; - - return (requested & ~vma->vm_flags) != 0; -} - -static void do_fault(struct work_struct *work) -{ - struct fault *fault = container_of(work, struct fault, work); - struct vm_area_struct *vma; - vm_fault_t ret = VM_FAULT_ERROR; - unsigned int flags = 0; - struct mm_struct *mm; - u64 address; - - mm = fault->state->mm; - address = fault->address; - - if (fault->flags & PPR_FAULT_USER) - flags |= FAULT_FLAG_USER; - if (fault->flags & PPR_FAULT_WRITE) - flags |= FAULT_FLAG_WRITE; - flags |= FAULT_FLAG_REMOTE; - - mmap_read_lock(mm); - vma = vma_lookup(mm, address); - if (!vma) - /* failed to get a vma in the right range */ - goto out; - - /* Check if we have the right permissions on the vma */ - if (access_error(vma, fault)) - goto out; - - ret = handle_mm_fault(vma, address, flags, NULL); -out: - mmap_read_unlock(mm); - - if (ret & VM_FAULT_ERROR) - /* failed to service fault */ - handle_fault_error(fault); - - finish_pri_tag(fault->dev_state, fault->state, fault->tag); - - put_pasid_state(fault->state); - - kfree(fault); -} - -static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data) -{ - struct amd_iommu_fault *iommu_fault; - struct pasid_state *pasid_state; - struct device_state *dev_state; - struct pci_dev *pdev = NULL; - unsigned long flags; - struct fault *fault; - bool finish; - u16 tag, devid, seg_id; - int ret; - - iommu_fault = data; - tag = iommu_fault->tag & 0x1ff; - finish = (iommu_fault->tag >> 9) & 1; - - seg_id = PCI_SBDF_TO_SEGID(iommu_fault->sbdf); - devid = PCI_SBDF_TO_DEVID(iommu_fault->sbdf); - pdev = pci_get_domain_bus_and_slot(seg_id, PCI_BUS_NUM(devid), - devid & 0xff); - if (!pdev) - return -ENODEV; - - ret = NOTIFY_DONE; - - /* In kdump kernel pci dev is not initialized yet -> send INVALID */ - if (amd_iommu_is_attach_deferred(&pdev->dev)) { - amd_iommu_complete_ppr(pdev, iommu_fault->pasid, - PPR_INVALID, tag); - goto out; - } - - dev_state = get_device_state(iommu_fault->sbdf); - if (dev_state == NULL) - goto out; - - pasid_state = get_pasid_state(dev_state, iommu_fault->pasid); - if (pasid_state == NULL || pasid_state->invalid) { - /* We know the device but not the PASID -> send INVALID */ - amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid, - PPR_INVALID, tag); - goto out_drop_state; - } - - spin_lock_irqsave(&pasid_state->lock, flags); - atomic_inc(&pasid_state->pri[tag].inflight); - if 
(finish) - pasid_state->pri[tag].finish = true; - spin_unlock_irqrestore(&pasid_state->lock, flags); - - fault = kzalloc(sizeof(*fault), GFP_ATOMIC); - if (fault == NULL) { - /* We are OOM - send success and let the device re-fault */ - finish_pri_tag(dev_state, pasid_state, tag); - goto out_drop_state; - } - - fault->dev_state = dev_state; - fault->address = iommu_fault->address; - fault->state = pasid_state; - fault->tag = tag; - fault->finish = finish; - fault->pasid = iommu_fault->pasid; - fault->flags = iommu_fault->flags; - INIT_WORK(&fault->work, do_fault); - - queue_work(iommu_wq, &fault->work); - - ret = NOTIFY_OK; - -out_drop_state: - - if (ret != NOTIFY_OK && pasid_state) - put_pasid_state(pasid_state); - - put_device_state(dev_state); - -out: - pci_dev_put(pdev); - return ret; -} - -static struct notifier_block ppr_nb = { - .notifier_call = ppr_notifier, -}; - -int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, - struct task_struct *task) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - struct mm_struct *mm; - u32 sbdf; - int ret; - - might_sleep(); - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - sbdf = get_pci_sbdf_id(pdev); - dev_state = get_device_state(sbdf); - - if (dev_state == NULL) - return -EINVAL; - - ret = -EINVAL; - if (pasid >= dev_state->max_pasids) - goto out; - - ret = -ENOMEM; - pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL); - if (pasid_state == NULL) - goto out; - - - refcount_set(&pasid_state->count, 1); - init_waitqueue_head(&pasid_state->wq); - spin_lock_init(&pasid_state->lock); - - mm = get_task_mm(task); - pasid_state->mm = mm; - pasid_state->device_state = dev_state; - pasid_state->pasid = pasid; - pasid_state->invalid = true; /* Mark as valid only if we are - done with setting up the pasid */ - pasid_state->mn.ops = &iommu_mn; - - if (pasid_state->mm == NULL) - goto out_free; - - ret = mmu_notifier_register(&pasid_state->mn, mm); - if (ret) - goto out_free; - - ret = set_pasid_state(dev_state, pasid_state, pasid); - if (ret) - goto out_unregister; - - ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid, - __pa(pasid_state->mm->pgd)); - if (ret) - goto out_clear_state; - - /* Now we are ready to handle faults */ - pasid_state->invalid = false; - - /* - * Drop the reference to the mm_struct here. We rely on the - * mmu_notifier release call-back to inform us when the mm - * is going away. - */ - mmput(mm); - - return 0; - -out_clear_state: - clear_pasid_state(dev_state, pasid); - -out_unregister: - mmu_notifier_unregister(&pasid_state->mn, mm); - mmput(mm); - -out_free: - free_pasid_state(pasid_state); - -out: - put_device_state(dev_state); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_bind_pasid); - -void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid) -{ - struct pasid_state *pasid_state; - struct device_state *dev_state; - u32 sbdf; - - might_sleep(); - - if (!amd_iommu_v2_supported()) - return; - - sbdf = get_pci_sbdf_id(pdev); - dev_state = get_device_state(sbdf); - if (dev_state == NULL) - return; - - if (pasid >= dev_state->max_pasids) - goto out; - - pasid_state = get_pasid_state(dev_state, pasid); - if (pasid_state == NULL) - goto out; - /* - * Drop reference taken here. We are safe because we still hold - * the reference taken in the amd_iommu_bind_pasid function. 
- */ - put_pasid_state(pasid_state); - - /* Clear the pasid state so that the pasid can be re-used */ - clear_pasid_state(dev_state, pasid_state->pasid); - - /* - * Call mmu_notifier_unregister to drop our reference - * to pasid_state->mm - */ - mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); - - put_pasid_state_wait(pasid_state); /* Reference taken in - amd_iommu_bind_pasid */ -out: - /* Drop reference taken in this function */ - put_device_state(dev_state); - - /* Drop reference taken in amd_iommu_bind_pasid */ - put_device_state(dev_state); -} -EXPORT_SYMBOL(amd_iommu_unbind_pasid); - -int amd_iommu_init_device(struct pci_dev *pdev, int pasids) -{ - struct device_state *dev_state; - struct iommu_group *group; - unsigned long flags; - int ret, tmp; - u32 sbdf; - - might_sleep(); - - /* - * When memory encryption is active the device is likely not in a - * direct-mapped domain. Forbid using IOMMUv2 functionality for now. - */ - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) - return -ENODEV; - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - if (pasids <= 0 || pasids > (PASID_MASK + 1)) - return -EINVAL; - - sbdf = get_pci_sbdf_id(pdev); - - dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL); - if (dev_state == NULL) - return -ENOMEM; - - spin_lock_init(&dev_state->lock); - init_waitqueue_head(&dev_state->wq); - dev_state->pdev = pdev; - dev_state->sbdf = sbdf; - - tmp = pasids; - for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9) - dev_state->pasid_levels += 1; - - atomic_set(&dev_state->count, 1); - dev_state->max_pasids = pasids; - - ret = -ENOMEM; - dev_state->states = (void *)get_zeroed_page(GFP_KERNEL); - if (dev_state->states == NULL) - goto out_free_dev_state; - - dev_state->domain = iommu_domain_alloc(&pci_bus_type); - if (dev_state->domain == NULL) - goto out_free_states; - - /* See iommu_is_default_domain() */ - dev_state->domain->type = IOMMU_DOMAIN_IDENTITY; - amd_iommu_domain_direct_map(dev_state->domain); - - ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids); - if (ret) - goto out_free_domain; - - group = iommu_group_get(&pdev->dev); - if (!group) { - ret = -EINVAL; - goto out_free_domain; - } - - ret = iommu_attach_group(dev_state->domain, group); - if (ret != 0) - goto out_drop_group; - - iommu_group_put(group); - - spin_lock_irqsave(&state_lock, flags); - - if (__get_device_state(sbdf) != NULL) { - spin_unlock_irqrestore(&state_lock, flags); - ret = -EBUSY; - goto out_free_domain; - } - - list_add_tail(&dev_state->list, &state_list); - - spin_unlock_irqrestore(&state_lock, flags); - - return 0; - -out_drop_group: - iommu_group_put(group); - -out_free_domain: - iommu_domain_free(dev_state->domain); - -out_free_states: - free_page((unsigned long)dev_state->states); - -out_free_dev_state: - kfree(dev_state); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_init_device); - -void amd_iommu_free_device(struct pci_dev *pdev) -{ - struct device_state *dev_state; - unsigned long flags; - u32 sbdf; - - if (!amd_iommu_v2_supported()) - return; - - sbdf = get_pci_sbdf_id(pdev); - - spin_lock_irqsave(&state_lock, flags); - - dev_state = __get_device_state(sbdf); - if (dev_state == NULL) { - spin_unlock_irqrestore(&state_lock, flags); - return; - } - - list_del(&dev_state->list); - - spin_unlock_irqrestore(&state_lock, flags); - - put_device_state(dev_state); - free_device_state(dev_state); -} -EXPORT_SYMBOL(amd_iommu_free_device); - -int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev, - amd_iommu_invalid_ppr_cb cb) -{ - struct device_state 
*dev_state; - unsigned long flags; - u32 sbdf; - int ret; - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - sbdf = get_pci_sbdf_id(pdev); - - spin_lock_irqsave(&state_lock, flags); - - ret = -EINVAL; - dev_state = __get_device_state(sbdf); - if (dev_state == NULL) - goto out_unlock; - - dev_state->inv_ppr_cb = cb; - - ret = 0; - -out_unlock: - spin_unlock_irqrestore(&state_lock, flags); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb); - -int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, - amd_iommu_invalidate_ctx cb) -{ - struct device_state *dev_state; - unsigned long flags; - u32 sbdf; - int ret; - - if (!amd_iommu_v2_supported()) - return -ENODEV; - - sbdf = get_pci_sbdf_id(pdev); - - spin_lock_irqsave(&state_lock, flags); - - ret = -EINVAL; - dev_state = __get_device_state(sbdf); - if (dev_state == NULL) - goto out_unlock; - - dev_state->inv_ctx_cb = cb; - - ret = 0; - -out_unlock: - spin_unlock_irqrestore(&state_lock, flags); - - return ret; -} -EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb); - -static int __init amd_iommu_v2_init(void) -{ - int ret; - - if (!amd_iommu_v2_supported()) { - pr_info("AMD IOMMUv2 functionality not available on this system - This is not a bug.\n"); - /* - * Load anyway to provide the symbols to other modules - * which may use AMD IOMMUv2 optionally. - */ - return 0; - } - - ret = -ENOMEM; - iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0); - if (iommu_wq == NULL) - goto out; - - amd_iommu_register_ppr_notifier(&ppr_nb); - - pr_info("AMD IOMMUv2 loaded and initialized\n"); - - return 0; - -out: - return ret; -} - -static void __exit amd_iommu_v2_exit(void) -{ - struct device_state *dev_state, *next; - unsigned long flags; - LIST_HEAD(freelist); - - if (!amd_iommu_v2_supported()) - return; - - amd_iommu_unregister_ppr_notifier(&ppr_nb); - - flush_workqueue(iommu_wq); - - /* - * The loop below might call flush_workqueue(), so call - * destroy_workqueue() after it - */ - spin_lock_irqsave(&state_lock, flags); - - list_for_each_entry_safe(dev_state, next, &state_list, list) { - WARN_ON_ONCE(1); - - put_device_state(dev_state); - list_del(&dev_state->list); - list_add_tail(&dev_state->list, &freelist); - } - - spin_unlock_irqrestore(&state_lock, flags); - - /* - * Since free_device_state waits on the count to be zero, - * we need to free dev_state outside the spinlock. 
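
The comment above describes a common pattern: device states are moved onto a private freelist while state_lock is held and only released after the lock is dropped, because free_device_state() may sleep waiting for the reference count. A toy standalone model of that collect-then-free pattern; all names here are illustrative:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy version of "detach entries to a private list under the lock,
 * then free them after dropping it", since freeing may sleep/wait. */
struct node {
	struct node *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *global_list;

static void drain_and_free(void)
{
	struct node *freelist, *n;

	pthread_mutex_lock(&list_lock);
	freelist = global_list;		/* unlink everything while locked */
	global_list = NULL;
	pthread_mutex_unlock(&list_lock);

	while ((n = freelist)) {	/* release outside the lock */
		freelist = n->next;
		free(n);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = calloc(1, sizeof(*n));

		n->next = global_list;
		global_list = n;
	}
	drain_and_free();
	printf("all entries released outside the lock\n");
	return 0;
}
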
- */ - list_for_each_entry_safe(dev_state, next, &freelist, list) { - list_del(&dev_state->list); - free_device_state(dev_state); - } - - destroy_workqueue(iommu_wq); -} - -module_init(amd_iommu_v2_init); -module_exit(amd_iommu_v2_exit); diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index b45842c19fe9c26835a195b4ebfb35006f150812..d5204c9f7186f92665e419f0324be39960b79371 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -504,10 +504,11 @@ static void apple_dart_iotlb_sync(struct iommu_domain *domain, apple_dart_domain_flush_tlb(to_dart_domain(domain)); } -static void apple_dart_iotlb_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int apple_dart_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { apple_dart_domain_flush_tlb(to_dart_domain(domain)); + return 0; } static phys_addr_t apple_dart_iova_to_phys(struct iommu_domain *domain, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 49d7b5a0f7480236665da6780252a447baf6eb5d..dd5628977628c18a75a2604e61526105b3751b9a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2662,9 +2662,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) struct arm_smmu_master *master; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - if (!fwspec || fwspec->ops != &arm_smmu_ops) - return ERR_PTR(-ENODEV); - if (WARN_ON_ONCE(dev_iommu_priv_get(dev))) return ERR_PTR(-EBUSY); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index e6b4bab0dde2e58da8da72348cf24fbbd7b6870f..4521101e648ab151ebe61858a4e4ec8504d60ac6 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -563,6 +563,7 @@ static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,sm6350-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sm6375-smmu-v2", .data = &qcom_smmu_v2_data }, { .compatible = "qcom,sm6375-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sm7150-smmu-v2", .data = &qcom_smmu_v2_data }, { .compatible = "qcom,sm8150-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sm8250-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sm8350-smmu-500", .data = &qcom_smmu_500_impl0_data }, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index cd5adc0300e31b016ed8343a561e7e40ee3159b8..f029b19764e9a7334cece343f671b8e0282bf907 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1121,11 +1121,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct arm_smmu_device *smmu; int ret; - if (!fwspec || fwspec->ops != &arm_smmu_ops) { - dev_err(dev, "cannot attach to SMMU, is it on the same bus?\n"); - return -ENXIO; - } - /* * FIXME: The arch/arm DMA API code tries to attach devices to its own * domains between of_xlate() and probe_device() - we have no way to cope @@ -1362,7 +1357,7 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) fwspec = dev_iommu_fwspec_get(dev); if (ret) goto out_free; - } else if (fwspec && fwspec->ops == &arm_smmu_ops) { + } else { smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); /* @@ -1375,8 +1370,6 @@ static struct iommu_device *arm_smmu_probe_device(struct 
device *dev) if (!smmu) return ERR_PTR(dev_err_probe(dev, -EPROBE_DEFER, "smmu dev has not bound yet\n")); - } else { - return ERR_PTR(-ENODEV); } #ifdef CONFIG_ARCH_PHYTIUM @@ -2195,7 +2188,8 @@ static int arm_smmu_device_probe(struct platform_device *pdev) return err; } - err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev); + err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, + using_legacy_binding ? NULL : dev); if (err) { dev_err(dev, "Failed to register iommu\n"); iommu_device_sysfs_remove(&smmu->iommu); diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 280cf35baed6d4fb84030e3636795a86630a85bd..52f996ef9ca4e13e2618a93bbccc3ac8a01499f7 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -79,16 +79,6 @@ static struct qcom_iommu_domain *to_qcom_iommu_domain(struct iommu_domain *dom) static const struct iommu_ops qcom_iommu_ops; -static struct qcom_iommu_dev * to_iommu(struct device *dev) -{ - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - - if (!fwspec || fwspec->ops != &qcom_iommu_ops) - return NULL; - - return dev_iommu_priv_get(dev); -} - static struct qcom_iommu_ctx * to_ctx(struct qcom_iommu_domain *d, unsigned asid) { struct qcom_iommu_dev *qcom_iommu = d->iommu; @@ -372,7 +362,7 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); + struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); int ret; @@ -404,7 +394,7 @@ static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain, struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct qcom_iommu_domain *qcom_domain; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); + struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); unsigned int i; if (domain == identity_domain || !domain) @@ -535,7 +525,7 @@ static bool qcom_iommu_capable(struct device *dev, enum iommu_cap cap) static struct iommu_device *qcom_iommu_probe_device(struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); + struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); struct device_link *link; if (!qcom_iommu) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index c8f20efce9343f587c838ff2cdc008ff2dbc8d81..c2926bd4f8fa1f1dd0df5ddc97e79ed58fd27c10 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -44,14 +44,28 @@ enum iommu_dma_cookie_type { IOMMU_DMA_MSI_COOKIE, }; +enum iommu_dma_queue_type { + IOMMU_DMA_OPTS_PER_CPU_QUEUE, + IOMMU_DMA_OPTS_SINGLE_QUEUE, +}; + +struct iommu_dma_options { + enum iommu_dma_queue_type qt; + size_t fq_size; + unsigned int fq_timeout; +}; + struct iommu_dma_cookie { enum iommu_dma_cookie_type type; union { /* Full allocator for IOMMU_DMA_IOVA_COOKIE */ struct { struct iova_domain iovad; - - struct iova_fq __percpu *fq; /* Flush queue */ + /* Flush queue */ + union { + struct iova_fq *single_fq; + struct iova_fq __percpu *percpu_fq; + }; /* Number of TLB flushes that have been started */ atomic64_t fq_flush_start_cnt; /* Number of TLB flushes that have been finished */ @@ -68,6 +82,8 @@ struct iommu_dma_cookie { /* Domain for flush queue callback; NULL if flush queue not in use */ struct iommu_domain *fq_domain; + /* Options for dma-iommu 
use */ + struct iommu_dma_options options; struct mutex mutex; }; @@ -85,10 +101,12 @@ static int __init iommu_dma_forcedac_setup(char *str) early_param("iommu.forcedac", iommu_dma_forcedac_setup); /* Number of entries per flush queue */ -#define IOVA_FQ_SIZE 256 +#define IOVA_DEFAULT_FQ_SIZE 256 +#define IOVA_SINGLE_FQ_SIZE 32768 /* Timeout (in ms) after which entries are flushed from the queue */ -#define IOVA_FQ_TIMEOUT 10 +#define IOVA_DEFAULT_FQ_TIMEOUT 10 +#define IOVA_SINGLE_FQ_TIMEOUT 1000 /* Flush queue entry for deferred flushing */ struct iova_fq_entry { @@ -100,18 +118,19 @@ struct iova_fq_entry { /* Per-CPU flush queue structure */ struct iova_fq { - struct iova_fq_entry entries[IOVA_FQ_SIZE]; - unsigned int head, tail; spinlock_t lock; + unsigned int head, tail; + unsigned int mod_mask; + struct iova_fq_entry entries[]; }; #define fq_ring_for_each(i, fq) \ - for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE) + for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) & (fq)->mod_mask) static inline bool fq_full(struct iova_fq *fq) { assert_spin_locked(&fq->lock); - return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head); + return (((fq->tail + 1) & fq->mod_mask) == fq->head); } static inline unsigned int fq_ring_add(struct iova_fq *fq) @@ -120,12 +139,12 @@ static inline unsigned int fq_ring_add(struct iova_fq *fq) assert_spin_locked(&fq->lock); - fq->tail = (idx + 1) % IOVA_FQ_SIZE; + fq->tail = (idx + 1) & fq->mod_mask; return idx; } -static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq) +static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq) { u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt); unsigned int idx; @@ -142,10 +161,19 @@ static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq) fq->entries[idx].iova_pfn, fq->entries[idx].pages); - fq->head = (fq->head + 1) % IOVA_FQ_SIZE; + fq->head = (fq->head + 1) & fq->mod_mask; } } +static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq) +{ + unsigned long flags; + + spin_lock_irqsave(&fq->lock, flags); + fq_ring_free_locked(cookie, fq); + spin_unlock_irqrestore(&fq->lock, flags); +} + static void fq_flush_iotlb(struct iommu_dma_cookie *cookie) { atomic64_inc(&cookie->fq_flush_start_cnt); @@ -161,14 +189,11 @@ static void fq_flush_timeout(struct timer_list *t) atomic_set(&cookie->fq_timer_on, 0); fq_flush_iotlb(cookie); - for_each_possible_cpu(cpu) { - unsigned long flags; - struct iova_fq *fq; - - fq = per_cpu_ptr(cookie->fq, cpu); - spin_lock_irqsave(&fq->lock, flags); - fq_ring_free(cookie, fq); - spin_unlock_irqrestore(&fq->lock, flags); + if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) { + fq_ring_free(cookie, cookie->single_fq); + } else { + for_each_possible_cpu(cpu) + fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu)); } } @@ -189,7 +214,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie, */ smp_mb(); - fq = raw_cpu_ptr(cookie->fq); + if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) + fq = cookie->single_fq; + else + fq = raw_cpu_ptr(cookie->percpu_fq); + spin_lock_irqsave(&fq->lock, flags); /* @@ -197,11 +226,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie, * flushed out on another CPU. This makes the fq_full() check below less * likely to be true. 
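
The flush-queue rework above turns struct iova_fq into a flexible-array ring whose head and tail wrap with "& mod_mask" instead of "% IOVA_FQ_SIZE"; the two are equivalent only while the queue size is a power of two, which both IOVA_DEFAULT_FQ_SIZE (256) and IOVA_SINGLE_FQ_SIZE (32768) are. A standalone model of the ring arithmetic, simplified to skip locking and use int payloads:

#include <stdio.h>
#include <stdlib.h>

/* Minimal model of the resizable flush-queue ring: head and tail wrap
 * with "& mod_mask", which equals "% size" only for power-of-two sizes. */
struct ring {
	unsigned int head, tail, mod_mask;
	int entries[];			/* flexible array, like iova_fq */
};

static struct ring *ring_alloc(unsigned int size)	/* size must be 2^n */
{
	struct ring *r = calloc(1, sizeof(*r) + size * sizeof(int));

	r->mod_mask = size - 1;
	return r;
}

static int ring_full(const struct ring *r)
{
	return ((r->tail + 1) & r->mod_mask) == r->head;
}

static void ring_add(struct ring *r, int v)
{
	r->entries[r->tail] = v;
	r->tail = (r->tail + 1) & r->mod_mask;
}

int main(void)
{
	struct ring *r = ring_alloc(8);
	int added = 0;

	while (!ring_full(r))
		ring_add(r, added++);
	/* One slot stays free to tell "full" from "empty": prints 7 of 8. */
	printf("queued %d of 8 slots\n", added);
	free(r);
	return 0;
}
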
*/ - fq_ring_free(cookie, fq); + fq_ring_free_locked(cookie, fq); if (fq_full(fq)) { fq_flush_iotlb(cookie); - fq_ring_free(cookie, fq); + fq_ring_free_locked(cookie, fq); } idx = fq_ring_add(fq); @@ -217,34 +246,95 @@ static void queue_iova(struct iommu_dma_cookie *cookie, if (!atomic_read(&cookie->fq_timer_on) && !atomic_xchg(&cookie->fq_timer_on, 1)) mod_timer(&cookie->fq_timer, - jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT)); + jiffies + msecs_to_jiffies(cookie->options.fq_timeout)); } -static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie) +static void iommu_dma_free_fq_single(struct iova_fq *fq) { - int cpu, idx; + int idx; - if (!cookie->fq) - return; + fq_ring_for_each(idx, fq) + put_pages_list(&fq->entries[idx].freelist); + vfree(fq); +} + +static void iommu_dma_free_fq_percpu(struct iova_fq __percpu *percpu_fq) +{ + int cpu, idx; - del_timer_sync(&cookie->fq_timer); /* The IOVAs will be torn down separately, so just free our queued pages */ for_each_possible_cpu(cpu) { - struct iova_fq *fq = per_cpu_ptr(cookie->fq, cpu); + struct iova_fq *fq = per_cpu_ptr(percpu_fq, cpu); fq_ring_for_each(idx, fq) put_pages_list(&fq->entries[idx].freelist); } - free_percpu(cookie->fq); + free_percpu(percpu_fq); +} + +static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie) +{ + if (!cookie->fq_domain) + return; + + del_timer_sync(&cookie->fq_timer); + if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) + iommu_dma_free_fq_single(cookie->single_fq); + else + iommu_dma_free_fq_percpu(cookie->percpu_fq); +} + +static void iommu_dma_init_one_fq(struct iova_fq *fq, size_t fq_size) +{ + int i; + + fq->head = 0; + fq->tail = 0; + fq->mod_mask = fq_size - 1; + + spin_lock_init(&fq->lock); + + for (i = 0; i < fq_size; i++) + INIT_LIST_HEAD(&fq->entries[i].freelist); +} + +static int iommu_dma_init_fq_single(struct iommu_dma_cookie *cookie) +{ + size_t fq_size = cookie->options.fq_size; + struct iova_fq *queue; + + queue = vmalloc(struct_size(queue, entries, fq_size)); + if (!queue) + return -ENOMEM; + iommu_dma_init_one_fq(queue, fq_size); + cookie->single_fq = queue; + + return 0; +} + +static int iommu_dma_init_fq_percpu(struct iommu_dma_cookie *cookie) +{ + size_t fq_size = cookie->options.fq_size; + struct iova_fq __percpu *queue; + int cpu; + + queue = __alloc_percpu(struct_size(queue, entries, fq_size), + __alignof__(*queue)); + if (!queue) + return -ENOMEM; + + for_each_possible_cpu(cpu) + iommu_dma_init_one_fq(per_cpu_ptr(queue, cpu), fq_size); + cookie->percpu_fq = queue; + return 0; } /* sysfs updates are serialised by the mutex of the group owning @domain */ int iommu_dma_init_fq(struct iommu_domain *domain) { struct iommu_dma_cookie *cookie = domain->iova_cookie; - struct iova_fq __percpu *queue; - int i, cpu; + int rc; if (cookie->fq_domain) return 0; @@ -252,26 +342,16 @@ int iommu_dma_init_fq(struct iommu_domain *domain) atomic64_set(&cookie->fq_flush_start_cnt, 0); atomic64_set(&cookie->fq_flush_finish_cnt, 0); - queue = alloc_percpu(struct iova_fq); - if (!queue) { + if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) + rc = iommu_dma_init_fq_single(cookie); + else + rc = iommu_dma_init_fq_percpu(cookie); + + if (rc) { pr_warn("iova flush queue initialization failed\n"); return -ENOMEM; } - for_each_possible_cpu(cpu) { - struct iova_fq *fq = per_cpu_ptr(queue, cpu); - - fq->head = 0; - fq->tail = 0; - - spin_lock_init(&fq->lock); - - for (i = 0; i < IOVA_FQ_SIZE; i++) - INIT_LIST_HEAD(&fq->entries[i].freelist); - } - - cookie->fq = queue; - 
timer_setup(&cookie->fq_timer, fq_flush_timeout, 0); atomic_set(&cookie->fq_timer_on, 0); /* @@ -574,6 +654,28 @@ static bool dev_use_sg_swiotlb(struct device *dev, struct scatterlist *sg, return false; } +/** + * iommu_dma_init_options - Initialize dma-iommu options + * @options: The options to be initialized + * @dev: Device the options are set for + * + * This allows tuning dma-iommu specific to device properties + */ +static void iommu_dma_init_options(struct iommu_dma_options *options, + struct device *dev) +{ + /* Shadowing IOTLB flushes do better with a single large queue */ + if (dev->iommu->shadow_on_flush) { + options->qt = IOMMU_DMA_OPTS_SINGLE_QUEUE; + options->fq_timeout = IOVA_SINGLE_FQ_TIMEOUT; + options->fq_size = IOVA_SINGLE_FQ_SIZE; + } else { + options->qt = IOMMU_DMA_OPTS_PER_CPU_QUEUE; + options->fq_size = IOVA_DEFAULT_FQ_SIZE; + options->fq_timeout = IOVA_DEFAULT_FQ_TIMEOUT; + } +} + /** * iommu_dma_init_domain - Initialise a DMA mapping domain * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() @@ -634,6 +736,8 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, if (ret) goto done_unlock; + iommu_dma_init_options(&cookie->options, dev); + /* If the FQ fails we can simply fall back to strict mode */ if (domain->type == IOMMU_DOMAIN_DMA_FQ && (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain))) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 05664f823a4999def9ab962a418477c91d68e647..d98c9161948a256597e4cabeda892e503c935c26 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -1229,7 +1229,7 @@ static int lv2set_page(sysmmu_pte_t *pent, phys_addr_t paddr, size_t size, */ static int exynos_iommu_map(struct iommu_domain *iommu_domain, unsigned long l_iova, phys_addr_t paddr, size_t size, - int prot, gfp_t gfp) + size_t count, int prot, gfp_t gfp, size_t *mapped) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); sysmmu_pte_t *entry; @@ -1263,6 +1263,8 @@ static int exynos_iommu_map(struct iommu_domain *iommu_domain, if (ret) pr_err("%s: Failed(%d) to map %#zx bytes @ %#x\n", __func__, ret, size, iova); + else + *mapped = size; spin_unlock_irqrestore(&domain->pgtablelock, flags); @@ -1284,7 +1286,7 @@ static void exynos_iommu_tlb_invalidate_entry(struct exynos_iommu_domain *domain } static size_t exynos_iommu_unmap(struct iommu_domain *iommu_domain, - unsigned long l_iova, size_t size, + unsigned long l_iova, size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); @@ -1477,8 +1479,8 @@ static const struct iommu_ops exynos_iommu_ops = { .of_xlate = exynos_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = exynos_iommu_attach_device, - .map = exynos_iommu_map, - .unmap = exynos_iommu_unmap, + .map_pages = exynos_iommu_map, + .unmap_pages = exynos_iommu_unmap, .iova_to_phys = exynos_iommu_iova_to_phys, .free = exynos_iommu_domain_free, } diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index 29d26a4371327c839f4cd23d1452fcba26806638..5402b699a1229ba0299a05c01470cd1c3ec25144 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o 
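
For reference, iommu_dma_init_options() above selects one large, slowly-flushed queue when the IOMMU sets shadow_on_flush (in this series that is assumed to be the s390 PCI case) and keeps the smaller per-CPU queues otherwise. The sketch below models only that selection and contrasts the total number of queued entries; the CPU count is an assumption for the example:

#include <stdbool.h>
#include <stdio.h>

/* Model of the queue-type selection done by iommu_dma_init_options(). */
struct dma_options {
	bool single_queue;
	unsigned int fq_size;
	unsigned int fq_timeout_ms;
};

static void init_options(struct dma_options *o, bool shadow_on_flush)
{
	if (shadow_on_flush) {
		o->single_queue = true;
		o->fq_size = 32768;	/* IOVA_SINGLE_FQ_SIZE */
		o->fq_timeout_ms = 1000;/* IOVA_SINGLE_FQ_TIMEOUT */
	} else {
		o->single_queue = false;
		o->fq_size = 256;	/* IOVA_DEFAULT_FQ_SIZE */
		o->fq_timeout_ms = 10;	/* IOVA_DEFAULT_FQ_TIMEOUT */
	}
}

int main(void)
{
	struct dma_options o;
	unsigned int ncpus = 64;	/* assumed CPU count for the example */

	init_options(&o, true);
	printf("single queue:   %u entries total\n", o.fq_size);
	init_options(&o, false);
	printf("per-CPU queues: %u entries total\n", o.fq_size * ncpus);
	return 0;
}
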
obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index 8ab7b3fbe3b1ab8e25278343d319ea2591794bd1..dee61e513be6d44f72d5b2c7071bf4a30e696e5f 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -106,8 +106,13 @@ static const struct iommu_regset iommu_regs_64[] = { IOMMU_REGSET_ENTRY(MTRR_PHYSMASK8), IOMMU_REGSET_ENTRY(MTRR_PHYSBASE9), IOMMU_REGSET_ENTRY(MTRR_PHYSMASK9), + IOMMU_REGSET_ENTRY(VCCAP), + IOMMU_REGSET_ENTRY(VCMD), + IOMMU_REGSET_ENTRY(VCRSP), }; +static struct dentry *intel_iommu_debug; + static int iommu_regset_show(struct seq_file *m, void *unused) { struct dmar_drhd_unit *drhd; @@ -308,9 +313,14 @@ static inline unsigned long level_to_directory_size(int level) static inline void dump_page_info(struct seq_file *m, unsigned long iova, u64 *path) { - seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\n", - iova >> VTD_PAGE_SHIFT, path[5], path[4], - path[3], path[2], path[1]); + seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx", + iova >> VTD_PAGE_SHIFT, path[5], path[4], path[3]); + if (path[2]) { + seq_printf(m, "\t0x%016llx", path[2]); + if (path[1]) + seq_printf(m, "\t0x%016llx", path[1]); + } + seq_putc(m, '\n'); } static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, @@ -337,58 +347,140 @@ static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, } } -static int __show_device_domain_translation(struct device *dev, void *data) +static int domain_translation_struct_show(struct seq_file *m, + struct device_domain_info *info, + ioasid_t pasid) { - struct dmar_domain *domain; - struct seq_file *m = data; - u64 path[6] = { 0 }; - - domain = to_dmar_domain(iommu_get_domain_for_dev(dev)); - if (!domain) - return 0; + bool scalable, found = false; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + u16 devfn, bus, seg; - seq_printf(m, "Device %s @0x%llx\n", dev_name(dev), - (u64)virt_to_phys(domain->pgd)); - seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n"); + bus = info->bus; + devfn = info->devfn; + seg = info->segment; - pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); - seq_putc(m, '\n'); + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + struct context_entry *context; + u64 pgd, path[6] = { 0 }; + u32 sts, agaw; - /* Don't iterate */ - return 1; -} + if (seg != iommu->segment) + continue; -static int show_device_domain_translation(struct device *dev, void *data) -{ - struct iommu_group *group; + sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_TES)) { + seq_printf(m, "DMA Remapping is not enabled on %s\n", + iommu->name); + continue; + } + if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) + scalable = true; + else + scalable = false; - group = iommu_group_get(dev); - if (group) { /* - * The group->mutex is held across the callback, which will - * block calls to iommu_attach/detach_group/device. Hence, + * The iommu->lock is held across the callback, which will + * block calls to domain_attach/domain_detach. Hence, * the domain of the device will not change during traversal. * - * All devices in an iommu group share a single domain, hence - * we only dump the domain of the first device. Even though, - * this code still possibly races with the iommu_unmap() + * Traversing page table possibly races with the iommu_unmap() * interface. 
This could be solved by RCU-freeing the page * table pages in the iommu_unmap() path. */ - iommu_group_for_each_dev(group, data, - __show_device_domain_translation); - iommu_group_put(group); + spin_lock(&iommu->lock); + + context = iommu_context_addr(iommu, bus, devfn, 0); + if (!context || !context_present(context)) + goto iommu_unlock; + + if (scalable) { /* scalable mode */ + struct pasid_entry *pasid_tbl, *pasid_tbl_entry; + struct pasid_dir_entry *dir_tbl, *dir_entry; + u16 dir_idx, tbl_idx, pgtt; + u64 pasid_dir_ptr; + + pasid_dir_ptr = context->lo & VTD_PAGE_MASK; + + /* Dump specified device domain mappings with PASID. */ + dir_idx = pasid >> PASID_PDE_SHIFT; + tbl_idx = pasid & PASID_PTE_MASK; + + dir_tbl = phys_to_virt(pasid_dir_ptr); + dir_entry = &dir_tbl[dir_idx]; + + pasid_tbl = get_pasid_table_from_pde(dir_entry); + if (!pasid_tbl) + goto iommu_unlock; + + pasid_tbl_entry = &pasid_tbl[tbl_idx]; + if (!pasid_pte_is_present(pasid_tbl_entry)) + goto iommu_unlock; + + /* + * According to PASID Granular Translation Type(PGTT), + * get the page table pointer. + */ + pgtt = (u16)(pasid_tbl_entry->val[0] & GENMASK_ULL(8, 6)) >> 6; + agaw = (u8)(pasid_tbl_entry->val[0] & GENMASK_ULL(4, 2)) >> 2; + + switch (pgtt) { + case PASID_ENTRY_PGTT_FL_ONLY: + pgd = pasid_tbl_entry->val[2]; + break; + case PASID_ENTRY_PGTT_SL_ONLY: + case PASID_ENTRY_PGTT_NESTED: + pgd = pasid_tbl_entry->val[0]; + break; + default: + goto iommu_unlock; + } + pgd &= VTD_PAGE_MASK; + } else { /* legacy mode */ + pgd = context->lo & VTD_PAGE_MASK; + agaw = context->hi & 7; + } + + seq_printf(m, "Device %04x:%02x:%02x.%x ", + iommu->segment, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + if (scalable) + seq_printf(m, "with pasid %x @0x%llx\n", pasid, pgd); + else + seq_printf(m, "@0x%llx\n", pgd); + + seq_printf(m, "%-17s\t%-18s\t%-18s\t%-18s\t%-18s\t%-s\n", + "IOVA_PFN", "PML5E", "PML4E", "PDPE", "PDE", "PTE"); + pgtable_walk_level(m, phys_to_virt(pgd), agaw + 2, 0, path); + + found = true; +iommu_unlock: + spin_unlock(&iommu->lock); + if (found) + break; } + rcu_read_unlock(); return 0; } -static int domain_translation_struct_show(struct seq_file *m, void *unused) +static int dev_domain_translation_struct_show(struct seq_file *m, void *unused) +{ + struct device_domain_info *info = (struct device_domain_info *)m->private; + + return domain_translation_struct_show(m, info, IOMMU_NO_PASID); +} +DEFINE_SHOW_ATTRIBUTE(dev_domain_translation_struct); + +static int pasid_domain_translation_struct_show(struct seq_file *m, void *unused) { - return bus_for_each_dev(&pci_bus_type, NULL, m, - show_device_domain_translation); + struct dev_pasid_info *dev_pasid = (struct dev_pasid_info *)m->private; + struct device_domain_info *info = dev_iommu_priv_get(dev_pasid->dev); + + return domain_translation_struct_show(m, info, dev_pasid->pasid); } -DEFINE_SHOW_ATTRIBUTE(domain_translation_struct); +DEFINE_SHOW_ATTRIBUTE(pasid_domain_translation_struct); static void invalidation_queue_entry_show(struct seq_file *m, struct intel_iommu *iommu) @@ -663,16 +755,12 @@ static const struct file_operations dmar_perf_latency_fops = { void __init intel_iommu_debugfs_init(void) { - struct dentry *intel_iommu_debug = debugfs_create_dir("intel", - iommu_debugfs_dir); + intel_iommu_debug = debugfs_create_dir("intel", iommu_debugfs_dir); debugfs_create_file("iommu_regset", 0444, intel_iommu_debug, NULL, &iommu_regset_fops); debugfs_create_file("dmar_translation_struct", 0444, intel_iommu_debug, NULL, &dmar_translation_struct_fops); - 
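The scalable-mode debugfs walk above finds the PASID-table entry by splitting the PASID into a directory index and a table index, then pulls PGTT and AGAW out of bit ranges of val[0]. A worked example of that arithmetic; the PASID_PDE_SHIFT/PASID_PTE_MASK values used here (6 and 0x3f, i.e. 64 table entries per directory entry) are taken as assumptions, pasid.h is authoritative.

#include <stdio.h>
#include <stdint.h>

#define TOY_PASID_PDE_SHIFT	6
#define TOY_PASID_PTE_MASK	0x3F
#define GENMASK_ULL(h, l) \
	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	uint32_t pasid = 0x1234;
	/* sample PASID-table entry word 0: PGTT = 2, AGAW = 2 */
	uint64_t pte_val0 = (2ULL << 6) | (2ULL << 2);

	unsigned int dir_idx = pasid >> TOY_PASID_PDE_SHIFT;
	unsigned int tbl_idx = pasid & TOY_PASID_PTE_MASK;
	unsigned int pgtt = (pte_val0 & GENMASK_ULL(8, 6)) >> 6;
	unsigned int agaw = (pte_val0 & GENMASK_ULL(4, 2)) >> 2;

	printf("pasid 0x%x -> directory entry %u, table entry %u\n",
	       pasid, dir_idx, tbl_idx);
	printf("PGTT %u (2 == second-level only), AGAW %u (48-bit, 4-level)\n",
	       pgtt, agaw);
	return 0;
}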
debugfs_create_file("domain_translation_struct", 0444, - intel_iommu_debug, NULL, - &domain_translation_struct_fops); debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug, NULL, &invalidation_queue_fops); #ifdef CONFIG_IRQ_REMAP @@ -682,3 +770,51 @@ void __init intel_iommu_debugfs_init(void) debugfs_create_file("dmar_perf_latency", 0644, intel_iommu_debug, NULL, &dmar_perf_latency_fops); } + +/* + * Create a debugfs directory for each device, and then create a + * debugfs file in this directory for users to dump the page table + * of the default domain. e.g. + * /sys/kernel/debug/iommu/intel/0000:00:01.0/domain_translation_struct + */ +void intel_iommu_debugfs_create_dev(struct device_domain_info *info) +{ + info->debugfs_dentry = debugfs_create_dir(dev_name(info->dev), intel_iommu_debug); + + debugfs_create_file("domain_translation_struct", 0444, info->debugfs_dentry, + info, &dev_domain_translation_struct_fops); +} + +/* Remove the device debugfs directory. */ +void intel_iommu_debugfs_remove_dev(struct device_domain_info *info) +{ + debugfs_remove_recursive(info->debugfs_dentry); +} + +/* + * Create a debugfs directory per pair of {device, pasid}, then create the + * corresponding debugfs file in this directory for users to dump its page + * table. e.g. + * /sys/kernel/debug/iommu/intel/0000:00:01.0/1/domain_translation_struct + * + * The debugfs only dumps the page tables whose mappings are created and + * destroyed by the iommu_map/unmap() interfaces. Check the mapping type + * of the domain before creating debugfs directory. + */ +void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev_pasid->dev); + char dir_name[10]; + + sprintf(dir_name, "%x", dev_pasid->pasid); + dev_pasid->debugfs_dentry = debugfs_create_dir(dir_name, info->debugfs_dentry); + + debugfs_create_file("domain_translation_struct", 0444, dev_pasid->debugfs_dentry, + dev_pasid, &pasid_domain_translation_struct_fops); +} + +/* Remove the device pasid debugfs directory. 
*/ +void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid) +{ + debugfs_remove_recursive(dev_pasid->debugfs_dentry); +} diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 48f1035601122cc65af65a6edac30cb8464c5fce..3af810b3fd67bc4aa37dd7f82539ad8eeb5916b7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -47,6 +47,9 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 +#define MAX_AGAW_WIDTH 64 +#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) + #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) @@ -61,6 +64,74 @@ #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) +/* page table handling */ +#define LEVEL_STRIDE (9) +#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) + +static inline int agaw_to_level(int agaw) +{ + return agaw + 2; +} + +static inline int agaw_to_width(int agaw) +{ + return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); +} + +static inline int width_to_agaw(int width) +{ + return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); +} + +static inline unsigned int level_to_offset_bits(int level) +{ + return (level - 1) * LEVEL_STRIDE; +} + +static inline int pfn_level_offset(u64 pfn, int level) +{ + return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; +} + +static inline u64 level_mask(int level) +{ + return -1ULL << level_to_offset_bits(level); +} + +static inline u64 level_size(int level) +{ + return 1ULL << level_to_offset_bits(level); +} + +static inline u64 align_to_level(u64 pfn, int level) +{ + return (pfn + level_size(level) - 1) & level_mask(level); +} + +static inline unsigned long lvl_to_nr_pages(unsigned int lvl) +{ + return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); +} + +/* VT-d pages must always be _smaller_ than MM pages. Otherwise things + are never going to work. 
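For reference, the AGAW/level helpers moved into iommu.c above reduce to shift arithmetic with a 9-bit stride per level. The sketch below copies those formulas (VTD_PAGE_SHIFT taken as 12, the 4KiB VT-d page) and applies them to one sample IOVA so the per-level table indices are visible.

#include <stdio.h>
#include <stdint.h>

#define VTD_PAGE_SHIFT	12
#define LEVEL_STRIDE	9
#define LEVEL_MASK	(((uint64_t)1 << LEVEL_STRIDE) - 1)
#define MAX_AGAW_WIDTH	64

static int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static int agaw_to_width(int agaw)
{
	int width = 30 + agaw * LEVEL_STRIDE;

	return width > MAX_AGAW_WIDTH ? MAX_AGAW_WIDTH : width;
}

static unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static int pfn_level_offset(uint64_t pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

int main(void)
{
	int agaw = 2;					/* 48-bit, 4-level table */
	uint64_t iova = 0x0000123456789000ULL;
	uint64_t pfn = iova >> VTD_PAGE_SHIFT;

	printf("agaw %d -> %d levels, %d-bit address width\n",
	       agaw, agaw_to_level(agaw), agaw_to_width(agaw));
	for (int level = agaw_to_level(agaw); level >= 1; level--)
		printf("  level %d index: %d\n",
		       level, pfn_level_offset(pfn, level));
	return 0;
}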
*/ +static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) +{ + return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); +} +static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) +{ + return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; +} +static inline unsigned long page_to_dma_pfn(struct page *pg) +{ + return mm_to_dma_pfn_start(page_to_pfn(pg)); +} +static inline unsigned long virt_to_dma_pfn(void *p) +{ + return page_to_dma_pfn(virt_to_page(p)); +} + static void __init check_tylersburg_isoch(void); static int rwbf_quirk; @@ -98,6 +169,78 @@ static phys_addr_t root_entry_uctp(struct root_entry *re) return re->hi & VTD_PAGE_MASK; } +static inline void context_set_present(struct context_entry *context) +{ + context->lo |= 1; +} + +static inline void context_set_fault_enable(struct context_entry *context) +{ + context->lo &= (((u64)-1) << 2) | 1; +} + +static inline void context_set_translation_type(struct context_entry *context, + unsigned long value) +{ + context->lo &= (((u64)-1) << 4) | 3; + context->lo |= (value & 3) << 2; +} + +static inline void context_set_address_root(struct context_entry *context, + unsigned long value) +{ + context->lo &= ~VTD_PAGE_MASK; + context->lo |= value & VTD_PAGE_MASK; +} + +static inline void context_set_address_width(struct context_entry *context, + unsigned long value) +{ + context->hi |= value & 7; +} + +static inline void context_set_domain_id(struct context_entry *context, + unsigned long value) +{ + context->hi |= (value & ((1 << 16) - 1)) << 8; +} + +static inline void context_set_pasid(struct context_entry *context) +{ + context->lo |= CONTEXT_PASIDE; +} + +static inline int context_domain_id(struct context_entry *c) +{ + return((c->hi >> 8) & 0xffff); +} + +static inline void context_clear_entry(struct context_entry *context) +{ + context->lo = 0; + context->hi = 0; +} + +static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + if (!iommu->copied_tables) + return false; + + return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} + +static inline void +set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + set_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} + +static inline void +clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} + /* * This domain is a statically identity mapping domain. * 1. This domain creats a static 1:1 mapping to all usable memory. 
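The mm_to_dma_pfn_{start,end}() helpers above expand one kernel pfn into a run of 4KiB VT-d pfns whenever the kernel page is larger than the VT-d page (hence the "must always be _smaller_" comment). A standalone example with an assumed 64KiB PAGE_SHIFT, purely to make the expansion visible; with 4KiB kernel pages the conversion is 1:1.

#include <stdio.h>

#define VTD_PAGE_SHIFT	12
#define PAGE_SHIFT	16	/* sample: 64KiB kernel pages */

static unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
{
	return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
}

int main(void)
{
	unsigned long mm_pfn = 5;

	printf("mm pfn %lu -> dma pfn %lu..%lu (%lu VT-d pages)\n",
	       mm_pfn, mm_to_dma_pfn_start(mm_pfn), mm_to_dma_pfn_end(mm_pfn),
	       mm_to_dma_pfn_end(mm_pfn) - mm_to_dma_pfn_start(mm_pfn) + 1);
	return 0;
}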
@@ -140,7 +283,6 @@ static LIST_HEAD(dmar_satc_units); #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) -static void device_block_translation(struct device *dev); static void intel_iommu_domain_free(struct iommu_domain *domain); int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); @@ -226,12 +368,29 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -static int domain_type_is_si(struct dmar_domain *domain) +void *alloc_pgtable_page(int node, gfp_t gfp) +{ + struct page *page; + void *vaddr = NULL; + + page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); + if (page) + vaddr = page_address(page); + return vaddr; +} + +void free_pgtable_page(void *vaddr) +{ + free_page((unsigned long)vaddr); +} + +static inline int domain_type_is_si(struct dmar_domain *domain) { return domain->domain.type == IOMMU_DOMAIN_IDENTITY; } -static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) +static inline int domain_pfn_supported(struct dmar_domain *domain, + unsigned long pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; @@ -293,7 +452,7 @@ int iommu_calculate_agaw(struct intel_iommu *iommu) return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); } -static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) +static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) { return sm_supported(iommu) ? ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); @@ -402,7 +561,7 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) } /* Some capabilities may be different across iommus */ -static void domain_update_iommu_cap(struct dmar_domain *domain) +void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); @@ -545,7 +704,7 @@ static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) return false; } -static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) +struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; struct pci_dev *pdev = NULL; @@ -1428,8 +1587,9 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, } /* Notification for newly created mappings */ -static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, - unsigned long pfn, unsigned int pages) +static inline void __mapping_notify_one(struct intel_iommu *iommu, + struct dmar_domain *domain, + unsigned long pfn, unsigned int pages) { /* * It's a non-present to present mapping. 
Only flush if caching mode @@ -1631,8 +1791,7 @@ static struct dmar_domain *alloc_domain(unsigned int type) return domain; } -static int domain_attach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; unsigned long ndomains; @@ -1681,8 +1840,7 @@ static int domain_attach_iommu(struct dmar_domain *domain, return ret; } -static void domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info; @@ -1698,7 +1856,7 @@ static void domain_detach_iommu(struct dmar_domain *domain, spin_unlock(&iommu->lock); } -static int guestwidth_to_adjustwidth(int gaw) +static inline int guestwidth_to_adjustwidth(int gaw) { int agaw; int r = (gaw - 12) % 9; @@ -1727,17 +1885,66 @@ static void domain_exit(struct dmar_domain *domain) kfree(domain); } +/* + * Get the PASID directory size for scalable mode context entry. + * Value of X in the PDTS field of a scalable mode context entry + * indicates PASID directory with 2^(X + 7) entries. + */ +static inline unsigned long context_get_sm_pds(struct pasid_table *table) +{ + unsigned long pds, max_pde; + + max_pde = table->max_pasid >> PASID_PDE_SHIFT; + pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); + if (pds < 7) + return 0; + + return pds - 7; +} + +/* + * Set the RID_PASID field of a scalable mode context entry. The + * IOMMU hardware will use the PASID value set in this field for + * DMA translations of DMA requests without PASID. + */ +static inline void +context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) +{ + context->hi |= pasid & ((1 << 20) - 1); +} + +/* + * Set the DTE(Device-TLB Enable) field of a scalable mode context + * entry. + */ +static inline void context_set_sm_dte(struct context_entry *context) +{ + context->lo |= BIT_ULL(2); +} + +/* + * Set the PRE(Page Request Enable) field of a scalable mode context + * entry. + */ +static inline void context_set_sm_pre(struct context_entry *context) +{ + context->lo |= BIT_ULL(4); +} + +/* Convert value to context PASID directory size field coding. */ +#define context_pdts(pds) (((pds) & 0x7) << 9) + static int domain_context_mapping_one(struct dmar_domain *domain, struct intel_iommu *iommu, + struct pasid_table *table, u8 bus, u8 devfn) { struct device_domain_info *info = domain_lookup_dev_info(domain, iommu, bus, devfn); u16 did = domain_id_iommu(domain, iommu); int translation = CONTEXT_TT_MULTI_LEVEL; - struct dma_pte *pgd = domain->pgd; struct context_entry *context; - int agaw, ret; + int ret; if (hw_pass_through && domain_type_is_si(domain)) translation = CONTEXT_TT_PASS_THROUGH; @@ -1780,37 +1987,65 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } context_clear_entry(context); - context_set_domain_id(context, did); - if (translation != CONTEXT_TT_PASS_THROUGH) { - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. 
- */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } + if (sm_supported(iommu)) { + unsigned long pds; - if (info && info->ats_supported) - translation = CONTEXT_TT_DEV_IOTLB; - else - translation = CONTEXT_TT_MULTI_LEVEL; + /* Setup the PASID DIR pointer: */ + pds = context_get_sm_pds(table); + context->lo = (u64)virt_to_phys(table->table) | + context_pdts(pds); + + /* Setup the RID_PASID field: */ + context_set_sm_rid2pasid(context, IOMMU_NO_PASID); - context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); - } else { /* - * In pass through mode, AW must be programmed to - * indicate the largest AGAW value supported by - * hardware. And ASR is ignored by hardware. + * Setup the Device-TLB enable bit and Page request + * Enable bit: */ - context_set_address_width(context, iommu->msagaw); + if (info && info->ats_supported) + context_set_sm_dte(context); + if (info && info->pri_supported) + context_set_sm_pre(context); + if (info && info->pasid_supported) + context_set_pasid(context); + } else { + struct dma_pte *pgd = domain->pgd; + int agaw; + + context_set_domain_id(context, did); + + if (translation != CONTEXT_TT_PASS_THROUGH) { + /* + * Skip top levels of page tables for iommu which has + * less agaw than default. Unnecessary for PT mode. + */ + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + ret = -ENOMEM; + pgd = phys_to_virt(dma_pte_addr(pgd)); + if (!dma_pte_present(pgd)) + goto out_unlock; + } + + if (info && info->ats_supported) + translation = CONTEXT_TT_DEV_IOTLB; + else + translation = CONTEXT_TT_MULTI_LEVEL; + + context_set_address_root(context, virt_to_phys(pgd)); + context_set_address_width(context, agaw); + } else { + /* + * In pass through mode, AW must be programmed to + * indicate the largest AGAW value supported by + * hardware. And ASR is ignored by hardware. 
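The scalable-mode branch above programs the context entry with the PASID directory pointer plus a PDTS field, where a value X encodes 2^(X + 7) directory entries, and places RID_PASID in the low 20 bits of the high word. A userspace rendering of that encoding; __builtin_ctzll() stands in for find_first_bit(), and the directory address is a made-up sample.

#include <stdio.h>
#include <stdint.h>

#define PASID_PDE_SHIFT		6	/* 64 PASID-table entries per directory entry */
#define context_pdts(pds)	(((pds) & 0x7) << 9)

static unsigned long context_get_sm_pds(uint32_t max_pasid)
{
	unsigned long max_pde = max_pasid >> PASID_PDE_SHIFT;
	unsigned long pds = __builtin_ctzll(max_pde);	/* lowest set bit */

	return pds < 7 ? 0 : pds - 7;
}

int main(void)
{
	uint32_t max_pasid = 1u << 20;		/* 20-bit PASID space */
	uint32_t rid_pasid = 0;			/* IOMMU_NO_PASID */
	unsigned long pds = context_get_sm_pds(max_pasid);
	uint64_t ctx_lo = 0x1000 /* sample PASID dir phys addr */ | context_pdts(pds);
	uint64_t ctx_hi = rid_pasid & ((1 << 20) - 1);

	printf("max_pasid %u -> PDTS %lu (2^%lu directory entries)\n",
	       max_pasid, pds, pds + 7);
	printf("context.lo = 0x%llx, context.hi = 0x%llx\n",
	       (unsigned long long)ctx_lo, (unsigned long long)ctx_hi);
	return 0;
}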
+ */ + context_set_address_width(context, iommu->msagaw); + } + + context_set_translation_type(context, translation); } - context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) @@ -1840,41 +2075,61 @@ static int domain_context_mapping_one(struct dmar_domain *domain, return ret; } +struct domain_context_mapping_data { + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct pasid_table *table; +}; + static int domain_context_mapping_cb(struct pci_dev *pdev, u16 alias, void *opaque) { - struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); - struct intel_iommu *iommu = info->iommu; - struct dmar_domain *domain = opaque; + struct domain_context_mapping_data *data = opaque; - return domain_context_mapping_one(domain, iommu, - PCI_BUS_NUM(alias), alias & 0xff); + return domain_context_mapping_one(data->domain, data->iommu, + data->table, PCI_BUS_NUM(alias), + alias & 0xff); } static int domain_context_mapping(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - u8 bus = info->bus, devfn = info->devfn; + struct domain_context_mapping_data data; + struct pasid_table *table; + struct intel_iommu *iommu; + u8 bus, devfn; + + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu) + return -ENODEV; + + table = intel_pasid_get_table(dev); if (!dev_is_pci(dev)) - return domain_context_mapping_one(domain, iommu, bus, devfn); + return domain_context_mapping_one(domain, iommu, table, + bus, devfn); + + data.domain = domain; + data.iommu = iommu; + data.table = table; return pci_for_each_dma_alias(to_pci_dev(dev), - domain_context_mapping_cb, domain); + &domain_context_mapping_cb, &data); } /* Returns a number of VTD pages, but aligned to MM page size */ -static unsigned long aligned_nrpages(unsigned long host_addr, size_t size) +static inline unsigned long aligned_nrpages(unsigned long host_addr, + size_t size) { host_addr &= ~PAGE_MASK; return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; } /* Return largest possible superpage level for a given mapping */ -static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phy_pfn, unsigned long pages) +static inline int hardware_largepage_caps(struct dmar_domain *domain, + unsigned long iov_pfn, + unsigned long phy_pfn, + unsigned long pages) { int support, level = 1; unsigned long pfnmerge; @@ -1952,6 +2207,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; + if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { + pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); attr |= DMA_FL_PTE_PRESENT; if (domain->use_first_level) { @@ -2044,6 +2304,9 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 struct context_entry *context; u16 did_old; + if (!iommu) + return; + spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 0); if (!context) { @@ -2051,7 +2314,14 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 return; } - did_old = context_domain_id(context); + if (sm_supported(iommu)) { + if (hw_pass_through && 
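aligned_nrpages() above rounds a host address/length pair up to whole VT-d pages after keeping only the sub-page offset of the host address. Two worked cases, assuming the common configuration where the kernel page size matches the 4KiB VT-d page.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)
#define VTD_PAGE_SHIFT	12

static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
{
	host_addr &= ~PAGE_MASK;			/* keep the in-page offset */
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}

int main(void)
{
	/* 32 bytes that straddle a page boundary still cost two pages. */
	printf("%lu page(s)\n", aligned_nrpages(0x1ff0, 0x20));
	/* A page-aligned 8KiB buffer costs exactly two pages. */
	printf("%lu page(s)\n", aligned_nrpages(0x4000, 0x2000));
	return 0;
}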
domain_type_is_si(info->domain)) + did_old = FLPT_DEFAULT_DID; + else + did_old = domain_id_iommu(info->domain, iommu); + } else { + did_old = context_domain_id(context); + } context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); @@ -2062,6 +2332,9 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); + if (sm_supported(iommu)) + qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); + iommu->flush.flush_iotlb(iommu, did_old, 0, @@ -2189,10 +2462,15 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; + struct intel_iommu *iommu; unsigned long flags; + u8 bus, devfn; int ret; + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu) + return -ENODEV; + ret = domain_attach_iommu(domain, iommu); if (ret) return ret; @@ -2201,19 +2479,28 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, list_add(&info->link, &domain->devices); spin_unlock_irqrestore(&domain->lock, flags); - if (dev_is_real_dma_subdevice(dev)) - return 0; - - if (!sm_supported(iommu)) - ret = domain_context_mapping(domain, dev); - else if (hw_pass_through && domain_type_is_si(domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); - else if (domain->use_first_level) - ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); - else - ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); + /* PASID table is mandatory for a PCI device in scalable mode. */ + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { + /* Setup the PASID entry for requests without PASID: */ + if (hw_pass_through && domain_type_is_si(domain)) + ret = intel_pasid_setup_pass_through(iommu, domain, + dev, IOMMU_NO_PASID); + else if (domain->use_first_level) + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID); + else + ret = intel_pasid_setup_second_level(iommu, domain, + dev, IOMMU_NO_PASID); + if (ret) { + dev_err(dev, "Setup RID2PASID failed\n"); + device_block_translation(dev); + return ret; + } + } + ret = domain_context_mapping(domain, dev); if (ret) { + dev_err(dev, "Domain context map failed\n"); device_block_translation(dev); return ret; } @@ -3359,7 +3646,7 @@ void intel_iommu_shutdown(void) up_write(&dmar_global_lock); } -static struct intel_iommu *dev_to_intel_iommu(struct device *dev) +static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) { struct iommu_device *iommu_dev = dev_to_iommu_device(dev); @@ -3438,7 +3725,7 @@ const struct attribute_group *intel_iommu_groups[] = { NULL, }; -static bool has_external_pci(void) +static inline bool has_external_pci(void) { struct pci_dev *pdev = NULL; @@ -3492,7 +3779,7 @@ static inline int acpi_rmrr_andd_probe(struct device *dev) ret = iommu_probe_device(dev); - iommu = device_lookup_iommu(dev, &bus, &devfn); + iommu = device_to_iommu(dev, &bus, &devfn); if (!iommu) { pr_info("dpoint-- cannot get acpi device corresponding iommu\n"); return -EINVAL; @@ -3722,12 +4009,36 @@ static void domain_context_clear(struct device_domain_info *info) &domain_context_clear_one_cb, info); } +static void dmar_remove_one_dev_info(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *domain = info->domain; + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + + if 
(!dev_is_real_dma_subdevice(info->dev)) { + if (dev_is_pci(info->dev) && sm_supported(iommu)) + intel_pasid_tear_down_entry(iommu, info->dev, + IOMMU_NO_PASID, false); + + iommu_disable_pci_caps(info); + domain_context_clear(info); + } + + spin_lock_irqsave(&domain->lock, flags); + list_del(&info->link); + spin_unlock_irqrestore(&domain->lock, flags); + + domain_detach_iommu(domain, iommu); + info->domain = NULL; +} + /* * Clear the page table pointer in context or pasid table entries so that * all DMA requests without PASID from the device are blocked. If the page * table has been set, clean up the data structures. */ -static void device_block_translation(struct device *dev) +void device_block_translation(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; @@ -3826,34 +4137,43 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) } static struct iommu_domain * -intel_iommu_domain_alloc_user(struct device *dev, u32 flags) +intel_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct iommu_domain *domain; + bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; struct intel_iommu *iommu = info->iommu; - bool dirty_tracking; + struct iommu_domain *domain; + + /* Must be NESTING domain */ + if (parent) { + if (!nested_supported(iommu) || flags) + return ERR_PTR(-EOPNOTSUPP); + return intel_nested_domain_alloc(parent, user_data); + } if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) return ERR_PTR(-EOPNOTSUPP); - - if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !nested_supported(iommu)) + if (nested_parent && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); - - dirty_tracking = (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING); - if (dirty_tracking && !ssads_supported(iommu)) + if (user_data || (dirty_tracking && !ssads_supported(iommu))) return ERR_PTR(-EOPNOTSUPP); /* - * domain_alloc_user op needs to fully initialize a domain - * before return, so uses iommu_domain_alloc() here for - * simple. + * domain_alloc_user op needs to fully initialize a domain before + * return, so uses iommu_domain_alloc() here for simple. 
*/ domain = iommu_domain_alloc(dev->bus); if (!domain) - domain = ERR_PTR(-ENOMEM); + return ERR_PTR(-ENOMEM); - if (!IS_ERR(domain) && dirty_tracking) { + if (nested_parent) + to_dmar_domain(domain)->nested_parent = true; + + if (dirty_tracking) { if (to_dmar_domain(domain)->use_first_level) { iommu_domain_free(domain); return ERR_PTR(-EOPNOTSUPP); @@ -3870,14 +4190,17 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) domain_exit(to_dmar_domain(domain)); } -static int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev) { - struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct intel_iommu *iommu = info->iommu; + struct intel_iommu *iommu; int addr_width; + iommu = device_to_iommu(dev, NULL, NULL); + if (!iommu) + return -ENODEV; + if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; @@ -3907,10 +4230,6 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, dmar_domain->agaw--; } - if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && - context_copied(iommu, info->bus, info->devfn)) - return intel_pasid_setup_sm_context(dev); - return 0; } @@ -4158,7 +4477,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) u8 bus, devfn; int ret; - iommu = device_lookup_iommu(dev, &bus, &devfn); + iommu = device_to_iommu(dev, &bus, &devfn); if (!iommu || !iommu->iommu.ops) return ERR_PTR(-ENODEV); @@ -4219,29 +4538,20 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) kfree(info); return ERR_PTR(ret); } - - if (!context_copied(iommu, info->bus, info->devfn)) { - ret = intel_pasid_setup_sm_context(dev); - if (ret) { - intel_pasid_free_table(dev); - return ERR_PTR(ret); - } - } } + intel_iommu_debugfs_create_dev(info); + return &iommu->iommu; } static void intel_iommu_release_device(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - - if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && - !context_copied(iommu, info->bus, info->devfn)) - intel_pasid_teardown_sm_context(dev); + dmar_remove_one_dev_info(dev); intel_pasid_free_table(dev); + intel_iommu_debugfs_remove_dev(info); kfree(info); set_dma_ops(dev, NULL); } @@ -4488,8 +4798,8 @@ static bool risky_device(struct pci_dev *pdev) return false; } -static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); unsigned long pages = aligned_nrpages(iova, size); @@ -4499,13 +4809,13 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, xa_for_each(&dmar_domain->iommu_array, i, info) __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); + return 0; } static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) { - struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct dev_pasid_info *curr, *dev_pasid = NULL; - struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; struct iommu_domain *domain; unsigned long flags; @@ -4537,6 +4847,7 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) 
spin_unlock_irqrestore(&dmar_domain->lock, flags); domain_detach_iommu(dmar_domain, iommu); + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); out_tear_down: intel_pasid_tear_down_entry(iommu, dev, pasid, false); @@ -4575,7 +4886,8 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, goto out_free; if (domain_type_is_si(dmar_domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, pasid); + ret = intel_pasid_setup_pass_through(iommu, dmar_domain, + dev, pasid); else if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, dev, pasid); @@ -4591,6 +4903,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); spin_unlock_irqrestore(&dmar_domain->lock, flags); + if (domain->type & __IOMMU_DOMAIN_PAGING) + intel_iommu_debugfs_create_dev_pasid(dev_pasid); + return 0; out_detach_iommu: domain_detach_iommu(dmar_domain, iommu); @@ -4609,6 +4924,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) if (!vtd) return ERR_PTR(-ENOMEM); + vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; vtd->cap_reg = iommu->cap; vtd->ecap_reg = iommu->ecap; *length = sizeof(*vtd); @@ -4695,7 +5011,6 @@ static const struct iommu_dirty_ops intel_dirty_ops = { const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, - .release_domain = &blocking_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index f63a29d9edd5ec716abea0c156e5fe9f35bdb4c7..4ec4be87f384a2f94430f38409a7209015c5e791 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -25,6 +25,7 @@ #include #include +#include /* * VT-d hardware uses 4KiB page size regardless of host page size. @@ -139,6 +140,9 @@ #define DMAR_ECEO_REG 0x408 #define DMAR_ECRSP_REG 0x410 #define DMAR_ECCAP_REG 0x430 +#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ +#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ +#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ #define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) #define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) @@ -600,20 +604,44 @@ struct dmar_domain { * iommu_map() interface. 
*/ u8 dirty_tracking:1; /* Dirty tracking is enabled */ + u8 nested_parent:1; /* Has other domains nested on it */ spinlock_t lock; /* Protect device tracking lists */ struct list_head devices; /* all devices' list */ struct list_head dev_pasids; /* all attached pasids */ - struct dma_pte *pgd; /* virtual address */ - int gaw; /* max guest address width */ - - /* adjusted guest address width, 0 is level 2 30-bit */ - int agaw; int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ - u64 max_addr; /* maximum mapped address */ + union { + /* DMA remapping domain */ + struct { + /* virtual address */ + struct dma_pte *pgd; + /* max guest address width */ + int gaw; + /* + * adjusted guest address width: + * 0: level 2 30-bit + * 1: level 3 39-bit + * 2: level 4 48-bit + * 3: level 5 57-bit + */ + int agaw; + /* maximum mapped address */ + u64 max_addr; + }; + + /* Nested user domain */ + struct { + /* parent page table which the user domain is nested on */ + struct dmar_domain *s2_domain; + /* user page table pointer (in GPA) */ + unsigned long s1_pgtbl; + /* page table attributes */ + struct iommu_hwpt_vtd_s1 s1_cfg; + }; + }; struct iommu_domain domain; /* generic domain data structure for iommu core */ @@ -724,12 +752,18 @@ struct device_domain_info { struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ struct pasid_table *pasid_table; /* pasid table */ +#ifdef CONFIG_INTEL_IOMMU_DEBUGFS + struct dentry *debugfs_dentry; /* pointer to device directory dentry */ +#endif }; struct dev_pasid_info { struct list_head link_domain; /* link to domain siblings */ struct device *dev; ioasid_t pasid; +#ifdef CONFIG_INTEL_IOMMU_DEBUGFS + struct dentry *debugfs_dentry; /* pointer to pasid directory dentry */ +#endif }; static inline void __iommu_flush_cache( @@ -820,181 +854,6 @@ static inline bool context_present(struct context_entry *context) return (context->lo & 1); } -#define LEVEL_STRIDE (9) -#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) -#define MAX_AGAW_WIDTH (64) -#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) - -static inline int agaw_to_level(int agaw) -{ - return agaw + 2; -} - -static inline int agaw_to_width(int agaw) -{ - return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); -} - -static inline int width_to_agaw(int width) -{ - return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); -} - -static inline unsigned int level_to_offset_bits(int level) -{ - return (level - 1) * LEVEL_STRIDE; -} - -static inline int pfn_level_offset(u64 pfn, int level) -{ - return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; -} - -static inline u64 level_mask(int level) -{ - return -1ULL << level_to_offset_bits(level); -} - -static inline u64 level_size(int level) -{ - return 1ULL << level_to_offset_bits(level); -} - -static inline u64 align_to_level(u64 pfn, int level) -{ - return (pfn + level_size(level) - 1) & level_mask(level); -} - -static inline unsigned long lvl_to_nr_pages(unsigned int lvl) -{ - return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); -} - -/* VT-d pages must always be _smaller_ than MM pages. Otherwise things - are never going to work. 
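The dmar_domain rework above folds the paging-domain fields and the nested-domain fields into one anonymous union, since a domain is only ever one of the two and the domain type says which arm is live. A minimal sketch of that shape with a stand-in discriminator; the field set is abbreviated and the names are illustrative, not the real struct.

#include <stdio.h>
#include <stdint.h>

struct toy_domain {
	int type;			/* 0 = paging, 1 = nested (illustrative) */
	union {
		struct {		/* DMA remapping domain */
			void *pgd;
			int gaw, agaw;
			uint64_t max_addr;
		};
		struct {		/* nested user domain */
			struct toy_domain *s2_domain;
			unsigned long s1_pgtbl;	/* guest stage-1 pgd (GPA) */
		};
	};
};

int main(void)
{
	struct toy_domain s2 = { .type = 0, .agaw = 2, .max_addr = 1ULL << 48 };
	struct toy_domain s1 = { .type = 1, .s2_domain = &s2,
				 .s1_pgtbl = 0x7f0000001000UL };

	printf("s2: agaw %d, max_addr 0x%llx\n",
	       s2.agaw, (unsigned long long)s2.max_addr);
	printf("s1: nested on s2 (agaw %d), guest pgd 0x%lx\n",
	       s1.s2_domain->agaw, s1.s1_pgtbl);
	return 0;
}

The union keeps the struct small and makes it explicit that the nested user domain never owns its own second-level page table; it always borrows the parent's.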
*/ -static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) -{ - return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); -} -static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) -{ - return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; -} -static inline unsigned long page_to_dma_pfn(struct page *pg) -{ - return mm_to_dma_pfn_start(page_to_pfn(pg)); -} -static inline unsigned long virt_to_dma_pfn(void *p) -{ - return page_to_dma_pfn(virt_to_page(p)); -} - -static inline void context_set_present(struct context_entry *context) -{ - context->lo |= 1; -} - -static inline void context_set_fault_enable(struct context_entry *context) -{ - context->lo &= (((u64)-1) << 2) | 1; -} - -static inline void context_set_translation_type(struct context_entry *context, - unsigned long value) -{ - context->lo &= (((u64)-1) << 4) | 3; - context->lo |= (value & 3) << 2; -} - -static inline void context_set_address_root(struct context_entry *context, - unsigned long value) -{ - context->lo &= ~VTD_PAGE_MASK; - context->lo |= value & VTD_PAGE_MASK; -} - -static inline void context_set_address_width(struct context_entry *context, - unsigned long value) -{ - context->hi |= value & 7; -} - -static inline void context_set_domain_id(struct context_entry *context, - unsigned long value) -{ - context->hi |= (value & ((1 << 16) - 1)) << 8; -} - -static inline void context_set_pasid(struct context_entry *context) -{ - context->lo |= CONTEXT_PASIDE; -} - -static inline int context_domain_id(struct context_entry *c) -{ - return((c->hi >> 8) & 0xffff); -} - -static inline void context_clear_entry(struct context_entry *context) -{ - context->lo = 0; - context->hi = 0; -} - -#ifdef CONFIG_INTEL_IOMMU -static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - if (!iommu->copied_tables) - return false; - - return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); -} - -static inline void -set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - set_bit(((long)bus << 8) | devfn, iommu->copied_tables); -} - -static inline void -clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); -} -#endif /* CONFIG_INTEL_IOMMU */ - -/* - * Set the RID_PASID field of a scalable mode context entry. The - * IOMMU hardware will use the PASID value set in this field for - * DMA translations of DMA requests without PASID. - */ -static inline void -context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) -{ - context->hi |= pasid & ((1 << 20) - 1); -} - -/* - * Set the DTE(Device-TLB Enable) field of a scalable mode context - * entry. - */ -static inline void context_set_sm_dte(struct context_entry *context) -{ - context->lo |= BIT_ULL(2); -} - -/* - * Set the PRE(Page Request Enable) field of a scalable mode context - * entry. - */ -static inline void context_set_sm_pre(struct context_entry *context) -{ - context->lo |= BIT_ULL(4); -} - -/* Convert value to context PASID directory size field coding. 
*/ -#define context_pdts(pds) (((pds) & 0x7) << 9) - struct dmar_drhd_unit *dmar_find_matched_drhd_unit(struct pci_dev *dev); int dmar_enable_qi(struct intel_iommu *iommu); @@ -1029,9 +888,19 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, */ #define QI_OPT_WAIT_DRAIN BIT(0) +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void device_block_translation(struct device *dev); +int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev); +void domain_update_iommu_cap(struct dmar_domain *domain); + int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); +struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); +struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, + const struct iommu_user_data *user_data); #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); @@ -1073,8 +942,16 @@ static inline void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid #ifdef CONFIG_INTEL_IOMMU_DEBUGFS void intel_iommu_debugfs_init(void); +void intel_iommu_debugfs_create_dev(struct device_domain_info *info); +void intel_iommu_debugfs_remove_dev(struct device_domain_info *info); +void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid); +void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid); #else static inline void intel_iommu_debugfs_init(void) {} +static inline void intel_iommu_debugfs_create_dev(struct device_domain_info *info) {} +static inline void intel_iommu_debugfs_remove_dev(struct device_domain_info *info) {} +static inline void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid) {} +static inline void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid) {} #endif /* CONFIG_INTEL_IOMMU_DEBUGFS */ extern const struct attribute_group *intel_iommu_groups[]; diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c new file mode 100644 index 0000000000000000000000000000000000000000..b5a5563ab32c6bbb3a2e780bb010875b95dd29f4 --- /dev/null +++ b/drivers/iommu/intel/nested.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nested.c - nested mode translation support + * + * Copyright (C) 2023 Intel Corporation + * + * Author: Lu Baolu + * Jacob Pan + * Yi Liu + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include +#include +#include + +#include "iommu.h" +#include "pasid.h" + +static int intel_nested_attach_dev(struct iommu_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + int ret = 0; + + if (info->domain) + device_block_translation(dev); + + if (iommu->agaw < dmar_domain->s2_domain->agaw) { + dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n"); + return -ENODEV; + } + + /* + * Stage-1 domain cannot work alone, it is nested on a s2_domain. + * The s2_domain will be used in nested translation, hence needs + * to ensure the s2_domain is compatible with this IOMMU. 
+ */ + ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + if (ret) { + dev_err_ratelimited(dev, "s2 domain is not compatible\n"); + return ret; + } + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) { + dev_err_ratelimited(dev, "Failed to attach domain to iommu\n"); + return ret; + } + + ret = intel_pasid_setup_nested(iommu, dev, + IOMMU_NO_PASID, dmar_domain); + if (ret) { + domain_detach_iommu(dmar_domain, iommu); + dev_err_ratelimited(dev, "Failed to setup pasid entry\n"); + return ret; + } + + info->domain = dmar_domain; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&info->link, &dmar_domain->devices); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return 0; +} + +static void intel_nested_domain_free(struct iommu_domain *domain) +{ + kfree(to_dmar_domain(domain)); +} + +static const struct iommu_domain_ops intel_nested_domain_ops = { + .attach_dev = intel_nested_attach_dev, + .free = intel_nested_domain_free, +}; + +struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct dmar_domain *s2_domain = to_dmar_domain(parent); + struct iommu_hwpt_vtd_s1 vtd; + struct dmar_domain *domain; + int ret; + + /* Must be nested domain */ + if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) + return ERR_PTR(-EOPNOTSUPP); + if (parent->ops != intel_iommu_ops.default_domain_ops || + !s2_domain->nested_parent) + return ERR_PTR(-EINVAL); + + ret = iommu_copy_struct_from_user(&vtd, user_data, + IOMMU_HWPT_DATA_VTD_S1, __reserved); + if (ret) + return ERR_PTR(ret); + + domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT); + if (!domain) + return ERR_PTR(-ENOMEM); + + domain->use_first_level = true; + domain->s2_domain = s2_domain; + domain->s1_pgtbl = vtd.pgtbl_addr; + domain->s1_cfg = vtd; + domain->domain.ops = &intel_nested_domain_ops; + domain->domain.type = IOMMU_DOMAIN_NESTED; + INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->dev_pasids); + spin_lock_init(&domain->lock); + xa_init(&domain->iommu_array); + + return &domain->domain; +} diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index f09e89c0928bdf2160d5233ffa44a520f48578da..9fb9b200112c5d95e0d4eace63ba8540f52e1f23 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -27,6 +27,63 @@ */ u32 intel_pasid_max_id = PASID_MAX; +int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid) +{ + unsigned long flags; + u8 status_code; + int ret = 0; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + *pasid = VCMD_VRSP_RESULT_PASID(res); + break; + case VCMD_VRSP_SC_NO_PASID_AVAIL: + pr_info("IOMMU: %s: No PASID available\n", iommu->name); + ret = -ENOSPC; + break; + default: + ret = -ENODEV; + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } + + return ret; +} + +void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid) +{ + unsigned long flags; + u8 status_code; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, + VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + 
raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + break; + case VCMD_VRSP_SC_INVALID_PASID: + pr_info("IOMMU: %s: Invalid PASID\n", iommu->name); + break; + default: + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } +} + /* * Per device pasid table management: */ @@ -173,6 +230,30 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) /* * Interfaces for PASID table entry manipulation: */ +static inline void pasid_clear_entry(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], 0); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], PASID_PTE_FPD); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + static void intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) { @@ -188,6 +269,192 @@ intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) pasid_clear_entry(pe); } +static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) +{ + u64 old; + + old = READ_ONCE(*ptr); + WRITE_ONCE(*ptr, (old & ~mask) | bits); +} + +static inline u64 pasid_get_bits(u64 *ptr) +{ + return READ_ONCE(*ptr); +} + +/* + * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode + * PASID entry. + */ +static inline void +pasid_set_domain_id(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value); +} + +/* + * Get domain ID value of a scalable mode PASID entry. + */ +static inline u16 +pasid_get_domain_id(struct pasid_entry *pe) +{ + return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); +} + +/* + * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_slptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value); +} + +/* + * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID + * entry. + */ +static inline void +pasid_set_address_width(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); +} + +/* + * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_translation_type(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); +} + +/* + * Enable fault processing by clearing the FPD(Fault Processing + * Disable) field (Bit 1) of a scalable mode PASID entry. + */ +static inline void pasid_set_fault_enable(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 1, 0); +} + +/* + * Enable second level A/D bits by setting the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); +} + +/* + * Disable second level A/D bits by clearing the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. 
+ */ +static inline void pasid_clear_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 0); +} + +/* + * Checks if second level A/D bits specifically the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry is set. + */ +static inline bool pasid_get_ssade(struct pasid_entry *pe) +{ + return pasid_get_bits(&pe->val[0]) & (1 << 9); +} + +/* + * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_sre(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 0, 1); +} + +/* + * Setup the WPE(Write Protect Enable) field (Bit 132) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_wpe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4); +} + +/* + * Setup the P(Present) field (Bit 0) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_present(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 0, 1); +} + +/* + * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) +{ + pasid_set_bits(&pe->val[1], 1 << 23, value << 23); +} + +/* + * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID + * entry. It is required when XD bit of the first level page table + * entry is about to be set. + */ +static inline void pasid_set_nxe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5); +} + +/* + * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode + * PASID entry. + */ +static inline void +pasid_set_pgsnp(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); +} + +/* + * Setup the First Level Page table Pointer field (Bit 140~191) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_flptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value); +} + +/* + * Setup the First Level Paging Mode field (Bit 130~131) of a + * scalable mode PASID entry. + */ +static inline void +pasid_set_flpm(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); +} + +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) @@ -349,9 +616,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, * Skip top levels of page tables for iommu which has less agaw * than default. Unnecessary for PT mode. */ -static int iommu_skip_agaw(struct dmar_domain *domain, - struct intel_iommu *iommu, - struct dma_pte **pgd) +static inline int iommu_skip_agaw(struct dmar_domain *domain, + struct intel_iommu *iommu, + struct dma_pte **pgd) { int agaw; @@ -503,6 +770,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, * Set up the scalable mode pasid entry for passthrough translation type. 
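All of the PASID-entry setters above funnel through pasid_set_bits(): clear a mask in one 64-bit word of the 512-bit entry, then OR in the new value. The sketch below reproduces a few of those field encodings on a plain in-memory entry (no READ_ONCE/WRITE_ONCE is needed in user space) and prints the resulting words.

#include <stdio.h>
#include <stdint.h>

#define GENMASK_ULL(h, l) \
	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

struct toy_pasid_entry {
	uint64_t val[8];	/* 512-bit scalable-mode PASID entry */
};

static void pasid_set_bits(uint64_t *ptr, uint64_t mask, uint64_t bits)
{
	*ptr = (*ptr & ~mask) | bits;
}

int main(void)
{
	struct toy_pasid_entry pe = { { 0 } };
	uint16_t did = 42;
	uint64_t agaw = 2;	/* 4-level, 48-bit */
	uint64_t pgtt = 2;	/* second-level only */

	pasid_set_bits(&pe.val[1], GENMASK_ULL(15, 0), did);		/* DID   */
	pasid_set_bits(&pe.val[0], GENMASK_ULL(4, 2), agaw << 2);	/* AW    */
	pasid_set_bits(&pe.val[0], GENMASK_ULL(8, 6), pgtt << 6);	/* PGTT  */
	pasid_set_bits(&pe.val[0], 1ULL << 0, 1);			/* P bit */

	printf("val[0] = 0x%016llx\nval[1] = 0x%016llx\n",
	       (unsigned long long)pe.val[0], (unsigned long long)pe.val[1]);
	return 0;
}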
*/ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, + struct dmar_domain *domain, struct device *dev, u32 pasid) { u16 did = FLPT_DEFAULT_DID; @@ -576,204 +844,96 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, devtlb_invalidation_with_pasid(iommu, dev, pasid); } -/* - * Interfaces to setup or teardown a pasid table to the scalable-mode - * context table entry: +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @pasid: PASID to be programmed in the device PASID table + * @domain: User stage-1 domain nested on a stage-2 domain + * + * This is used for nested translation. The input domain should be + * nested type and nested on a parent with 'is_nested_parent' flag + * set. */ - -static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - struct context_entry *context; + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct dma_pte *pgd = s2_domain->pgd; + struct pasid_entry *pte; - spin_lock(&iommu->lock); - context = iommu_context_addr(iommu, bus, devfn, false); - if (!context) { - spin_unlock(&iommu->lock); - return; + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; } - context_clear_entry(context); - __iommu_flush_cache(iommu, context, sizeof(*context)); - spin_unlock(&iommu->lock); - - /* - * Cache invalidation for changes to a scalable-mode context table - * entry. - * - * Section 6.5.3.3 of the VT-d spec: - * - Device-selective context-cache invalidation; - * - Domain-selective PASID-cache invalidation to affected domains - * (can be skipped if all PASID entries were not-present); - * - Domain-selective IOTLB invalidation to affected domains; - * - Global Device-TLB invalidation to affected functions. - * - * The iommu has been parked in the blocking state. All domains have - * been detached from the device or PASID. The PASID and IOTLB caches - * have been invalidated during the domain detach path. 
- */ - iommu->flush.flush_context(iommu, 0, PCI_DEVID(bus, devfn), - DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); - devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID); -} - -static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) -{ - struct device *dev = data; - - if (dev == &pdev->dev) - device_pasid_table_teardown(dev, PCI_BUS_NUM(alias), alias & 0xff); - - return 0; -} - -void intel_pasid_teardown_sm_context(struct device *dev) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - - if (!dev_is_pci(dev)) { - device_pasid_table_teardown(dev, info->bus, info->devfn); - return; + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; } - pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_teardown, dev); -} - -/* - * Get the PASID directory size for scalable mode context entry. - * Value of X in the PDTS field of a scalable mode context entry - * indicates PASID directory with 2^(X + 7) entries. - */ -static unsigned long context_get_sm_pds(struct pasid_table *table) -{ - unsigned long pds, max_pde; - - max_pde = table->max_pasid >> PASID_PDE_SHIFT; - pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); - if (pds < 7) - return 0; - - return pds - 7; -} - -static int context_entry_set_pasid_table(struct context_entry *context, - struct device *dev) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct pasid_table *table = info->pasid_table; - struct intel_iommu *iommu = info->iommu; - unsigned long pds; - - context_clear_entry(context); - - pds = context_get_sm_pds(table); - context->lo = (u64)virt_to_phys(table->table) | context_pdts(pds); - context_set_sm_rid2pasid(context, IOMMU_NO_PASID); - - if (info->ats_supported) - context_set_sm_dte(context); - if (info->pri_supported) - context_set_sm_pre(context); - if (info->pasid_supported) - context_set_pasid(context); - - context_set_fault_enable(context); - context_set_present(context); - __iommu_flush_cache(iommu, context, sizeof(*context)); - - return 0; -} - -static int device_pasid_table_setup(struct device *dev, u8 bus, u8 devfn) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - struct context_entry *context; + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } spin_lock(&iommu->lock); - context = iommu_context_addr(iommu, bus, devfn, true); - if (!context) { + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { spin_unlock(&iommu->lock); - return -ENOMEM; + return -ENODEV; } - - if (context_present(context) && !context_copied(iommu, bus, devfn)) { + if (pasid_pte_is_present(pte)) { spin_unlock(&iommu->lock); - return 0; + return -EBUSY; } - if (context_copied(iommu, bus, devfn)) { - context_clear_entry(context); - __iommu_flush_cache(iommu, context, sizeof(*context)); - - /* - * For kdump cases, old valid entries may be cached due to - * the in-flight DMA and copied pgtable, but there is no - * unmapping behaviour for them, thus we need explicit cache - * flushes for all affected domain IDs and PASIDs used in - * the copied PASID table. Given that we have no idea about - * which domain IDs and PASIDs were used in the copied tables, - * upgrade them to global PASID and IOTLB cache invalidation. 
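/*
 * For orientation, the stage-1 description validated above, reduced to the
 * fields this function actually reads.  The struct, field and flag names are
 * the ones used in this hunk; the initializer itself is a hypothetical
 * example, and the guest first-level PGD is not part of it because it reaches
 * this code separately as domain->s1_pgtbl.
 */
struct iommu_hwpt_vtd_s1 s1_cfg = {
	.flags      = IOMMU_VTD_S1_SRE | IOMMU_VTD_S1_WPE,	/* WPE only applies when SRE is set */
	.addr_width = ADDR_WIDTH_4LEVEL,	/* ADDR_WIDTH_5LEVEL also needs cap_fl5lp_support() */
};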
- */ - iommu->flush.flush_context(iommu, 0, - PCI_DEVID(bus, devfn), - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); - devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID); + pasid_clear_entry(pte); - /* - * At this point, the device is supposed to finish reset at - * its driver probe stage, so no in-flight DMA will exist, - * and we don't need to worry anymore hereafter. - */ - clear_context_copied(iommu, bus, devfn); - } + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); - context_entry_set_pasid_table(context, dev); - spin_unlock(&iommu->lock); + pasid_set_flptr(pte, (uintptr_t)s1_gpgd); - /* - * It's a non-present to present mapping. If hardware doesn't cache - * non-present entry we don't need to flush the caches. If it does - * cache non-present entries, then it does so in the special - * domain #0, which we have to flush: - */ - if (cap_caching_mode(iommu->cap)) { - iommu->flush.flush_context(iommu, 0, - PCI_DEVID(bus, devfn), - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH); + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); } - return 0; -} - -static int pci_pasid_table_setup(struct pci_dev *pdev, u16 alias, void *data) -{ - struct device *dev = data; + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); - if (dev != &pdev->dev) - return 0; - - return device_pasid_table_setup(dev, PCI_BUS_NUM(alias), alias & 0xff); -} + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); -/* - * Set the device's PASID table to its context table entry. - * - * The PASID table is set to the context entries of both device itself - * and its alias requester ID for DMA. - */ -int intel_pasid_setup_sm_context(struct device *dev) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); + pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, s2_domain->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + spin_unlock(&iommu->lock); - if (!dev_is_pci(dev)) - return device_pasid_table_setup(dev, info->bus, info->devfn); + pasid_flush_caches(iommu, pte, pasid, did); - return pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_setup, dev); + return 0; } diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 7cc889d6cba97f3abf98846b991a31c61abf6367..dd37611175cc1b9e4009aad7d0c09522147128eb 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -22,6 +22,16 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) +/* Virtual command interface for enlightened pasid management. */ +#define VCMD_CMD_ALLOC 0x1 +#define VCMD_CMD_FREE 0x2 +#define VCMD_VRSP_IP 0x1 +#define VCMD_VRSP_SC(e) (((e) & 0xff) >> 1) +#define VCMD_VRSP_SC_SUCCESS 0 +#define VCMD_VRSP_SC_NO_PASID_AVAIL 16 +#define VCMD_VRSP_SC_INVALID_PASID 16 +#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 16) & 0xfffff) +#define VCMD_CMD_OPERAND(e) ((e) << 16) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. 
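/*
 * A self-contained sketch of how the virtual command macros above compose a
 * request and pick apart a response.  The MMIO access itself (writing the
 * command and polling the response register) is omitted, and the sample
 * response value is made up; only the macro definitions are taken from the
 * header.
 */
#include <stdint.h>
#include <stdio.h>

#define VCMD_CMD_ALLOC			0x1
#define VCMD_CMD_FREE			0x2
#define VCMD_VRSP_IP			0x1
#define VCMD_VRSP_SC(e)			(((e) & 0xff) >> 1)
#define VCMD_VRSP_SC_SUCCESS		0
#define VCMD_VRSP_RESULT_PASID(e)	(((e) >> 16) & 0xfffff)
#define VCMD_CMD_OPERAND(e)		((e) << 16)

int main(void)
{
	/* Free PASID 0x42: opcode in the low bits, operand shifted up by 16. */
	uint64_t free_cmd = VCMD_CMD_FREE | VCMD_CMD_OPERAND(0x42ULL);
	/* Pretend response to an earlier VCMD_CMD_ALLOC: IP clear, SC success. */
	uint64_t rsp = 0x42ULL << 16;

	printf("free cmd = %#llx\n", (unsigned long long)free_cmd);
	if (!(rsp & VCMD_VRSP_IP) && VCMD_VRSP_SC(rsp) == VCMD_VRSP_SC_SUCCESS)
		printf("allocated PASID %#llx\n",
		       (unsigned long long)VCMD_VRSP_RESULT_PASID(rsp));
	return 0;
}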
@@ -86,216 +96,6 @@ static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); } -static inline void pasid_clear_entry(struct pasid_entry *pe) -{ - WRITE_ONCE(pe->val[0], 0); - WRITE_ONCE(pe->val[1], 0); - WRITE_ONCE(pe->val[2], 0); - WRITE_ONCE(pe->val[3], 0); - WRITE_ONCE(pe->val[4], 0); - WRITE_ONCE(pe->val[5], 0); - WRITE_ONCE(pe->val[6], 0); - WRITE_ONCE(pe->val[7], 0); -} - -static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) -{ - WRITE_ONCE(pe->val[0], PASID_PTE_FPD); - WRITE_ONCE(pe->val[1], 0); - WRITE_ONCE(pe->val[2], 0); - WRITE_ONCE(pe->val[3], 0); - WRITE_ONCE(pe->val[4], 0); - WRITE_ONCE(pe->val[5], 0); - WRITE_ONCE(pe->val[6], 0); - WRITE_ONCE(pe->val[7], 0); -} - -static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) -{ - u64 old; - - old = READ_ONCE(*ptr); - WRITE_ONCE(*ptr, (old & ~mask) | bits); -} - -static inline u64 pasid_get_bits(u64 *ptr) -{ - return READ_ONCE(*ptr); -} - -/* - * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode - * PASID entry. - */ -static inline void -pasid_set_domain_id(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value); -} - -/* - * Get domain ID value of a scalable mode PASID entry. - */ -static inline u16 -pasid_get_domain_id(struct pasid_entry *pe) -{ - return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); -} - -/* - * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) - * of a scalable mode PASID entry. - */ -static inline void -pasid_set_slptr(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value); -} - -/* - * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID - * entry. - */ -static inline void -pasid_set_address_width(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); -} - -/* - * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) - * of a scalable mode PASID entry. - */ -static inline void -pasid_set_translation_type(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); -} - -/* - * Enable fault processing by clearing the FPD(Fault Processing - * Disable) field (Bit 1) of a scalable mode PASID entry. - */ -static inline void pasid_set_fault_enable(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 1, 0); -} - -/* - * Enable second level A/D bits by setting the SLADE (Second Level - * Access Dirty Enable) field (Bit 9) of a scalable mode PASID - * entry. - */ -static inline void pasid_set_ssade(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); -} - -/* - * Disable second level A/D bits by clearing the SLADE (Second Level - * Access Dirty Enable) field (Bit 9) of a scalable mode PASID - * entry. - */ -static inline void pasid_clear_ssade(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 9, 0); -} - -/* - * Checks if second level A/D bits specifically the SLADE (Second Level - * Access Dirty Enable) field (Bit 9) of a scalable mode PASID - * entry is set. - */ -static inline bool pasid_get_ssade(struct pasid_entry *pe) -{ - return pasid_get_bits(&pe->val[0]) & (1 << 9); -} - -/* - * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a - * scalable mode PASID entry. 
- */ -static inline void pasid_set_sre(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 0, 1); -} - -/* - * Setup the WPE(Write Protect Enable) field (Bit 132) of a - * scalable mode PASID entry. - */ -static inline void pasid_set_wpe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4); -} - -/* - * Setup the P(Present) field (Bit 0) of a scalable mode PASID - * entry. - */ -static inline void pasid_set_present(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 0, 1); -} - -/* - * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID - * entry. - */ -static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) -{ - pasid_set_bits(&pe->val[1], 1 << 23, value << 23); -} - -/* - * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID - * entry. It is required when XD bit of the first level page table - * entry is about to be set. - */ -static inline void pasid_set_nxe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5); -} - -/* - * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode - * PASID entry. - */ -static inline void -pasid_set_pgsnp(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); -} - -/* - * Setup the First Level Page table Pointer field (Bit 140~191) - * of a scalable mode PASID entry. - */ -static inline void -pasid_set_flptr(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value); -} - -/* - * Setup the First Level Paging Mode field (Bit 130~131) of a - * scalable mode PASID entry. - */ -static inline void -pasid_set_flpm(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); -} - -/* - * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) - * of a scalable mode PASID entry. 
- */ -static inline void pasid_set_eafe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); -} - extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); @@ -311,12 +111,15 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool enabled); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, + struct dmar_domain *domain, struct device *dev, u32 pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); +int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid); +void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid); void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, struct device *dev, u32 pasid); -int intel_pasid_setup_sm_context(struct device *dev); -void intel_pasid_teardown_sm_context(struct device *dev); #endif /* __INTEL_PASID_H */ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 8a56b301efa0464053faa06ad3999145ff6f666d..4072b43f28c8136b3394e84a01662a5820fbae1c 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -391,9 +391,14 @@ static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, void intel_svm_remove_dev_pasid(struct device *dev, u32 pasid) { struct intel_svm_dev *sdev; + struct intel_iommu *iommu; struct intel_svm *svm; struct mm_struct *mm; + iommu = device_to_iommu(dev, NULL, NULL); + if (!iommu) + return; + if (pasid_to_svm_sdev(dev, pasid, &svm, &sdev)) return; mm = svm->mm; @@ -744,16 +749,25 @@ int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - u8 bus = info->bus, devfn = info->devfn; struct iommu_fault_page_request *prm; + struct intel_iommu *iommu; bool private_present; bool pasid_present; bool last_page; + u8 bus, devfn; int ret = 0; u16 sid; + if (!dev || !dev_is_pci(dev)) + return -ENODEV; + + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu) + return -ENODEV; + + if (!msg || !evt) + return -EINVAL; + prm = &evt->fault.prm; sid = PCI_DEVID(bus, devfn); pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index bc758ab70f4940a48b7e186b2b898635bf23a0e4..5ef6cae0c2b5402d6297f86084b850f375face3d 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -200,20 +200,28 @@ static dma_addr_t __arm_lpae_dma_addr(void *pages) } static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, - struct io_pgtable_cfg *cfg) + struct io_pgtable_cfg *cfg, + void *cookie) { struct device *dev = cfg->iommu_dev; int order = get_order(size); - struct page *p; dma_addr_t dma; void *pages; VM_BUG_ON((gfp & __GFP_HIGHMEM)); - p = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, order); - if (!p) + + if (cfg->alloc) { + pages = cfg->alloc(cookie, size, gfp); + } else { + struct page *p; + + p = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, order); + pages = p ? 
page_address(p) : NULL; + } + + if (!pages) return NULL; - pages = page_address(p); if (!cfg->coherent_walk) { dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE); if (dma_mapping_error(dev, dma)) @@ -232,18 +240,28 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, out_unmap: dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n"); dma_unmap_single(dev, dma, size, DMA_TO_DEVICE); + out_free: - __free_pages(p, order); + if (cfg->free) + cfg->free(cookie, pages, size); + else + free_pages((unsigned long)pages, order); + return NULL; } static void __arm_lpae_free_pages(void *pages, size_t size, - struct io_pgtable_cfg *cfg) + struct io_pgtable_cfg *cfg, + void *cookie) { if (!cfg->coherent_walk) dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages), size, DMA_TO_DEVICE); - free_pages((unsigned long)pages, get_order(size)); + + if (cfg->free) + cfg->free(cookie, pages, size); + else + free_pages((unsigned long)pages, get_order(size)); } static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, @@ -385,13 +403,13 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, /* Grab a pointer to the next level */ pte = READ_ONCE(*ptep); if (!pte) { - cptep = __arm_lpae_alloc_pages(tblsz, gfp, cfg); + cptep = __arm_lpae_alloc_pages(tblsz, gfp, cfg, data->iop.cookie); if (!cptep) return -ENOMEM; pte = arm_lpae_install_table(cptep, ptep, 0, data); if (pte) - __arm_lpae_free_pages(cptep, tblsz, cfg); + __arm_lpae_free_pages(cptep, tblsz, cfg, data->iop.cookie); } else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) { __arm_lpae_sync_pte(ptep, 1, cfg); } @@ -535,7 +553,7 @@ static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl, __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); } - __arm_lpae_free_pages(start, table_size, &data->iop.cfg); + __arm_lpae_free_pages(start, table_size, &data->iop.cfg, data->iop.cookie); } static void arm_lpae_free_pgtable(struct io_pgtable *iop) @@ -563,7 +581,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) return 0; - tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg); + tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie); if (!tablep) return 0; /* Bytes unmapped */ @@ -586,7 +604,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, pte = arm_lpae_install_table(tablep, ptep, blk_pte, data); if (pte != blk_pte) { - __arm_lpae_free_pages(tablep, tablesz, cfg); + __arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie); /* * We may race against someone unmapping another part of this * block, but anything else is invalid. 
We can't misinterpret @@ -893,7 +911,7 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) /* Looking good; allocate a pgd */ data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), - GFP_KERNEL, cfg); + GFP_KERNEL, cfg, cookie); if (!data->pgd) goto out_free_data; @@ -995,7 +1013,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) /* Allocate pgd pages */ data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), - GFP_KERNEL, cfg); + GFP_KERNEL, cfg, cookie); if (!data->pgd) goto out_free_data; @@ -1070,7 +1088,7 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)); data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), GFP_KERNEL, - cfg); + cfg, cookie); if (!data->pgd) goto out_free_data; @@ -1091,26 +1109,31 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) } struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_64_lpae_alloc_pgtable_s1, .free = arm_lpae_free_pgtable, }; struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_64_lpae_alloc_pgtable_s2, .free = arm_lpae_free_pgtable, }; struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_32_lpae_alloc_pgtable_s1, .free = arm_lpae_free_pgtable, }; struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_32_lpae_alloc_pgtable_s2, .free = arm_lpae_free_pgtable, }; struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_mali_lpae_alloc_pgtable, .free = arm_lpae_free_pgtable, }; diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c index b843fcd365d286668273667401b331a27fc04d23..8841c1487f00481f92759e499ad85a70423b7428 100644 --- a/drivers/iommu/io-pgtable.c +++ b/drivers/iommu/io-pgtable.c @@ -34,6 +34,26 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = { #endif }; +static int check_custom_allocator(enum io_pgtable_fmt fmt, + struct io_pgtable_cfg *cfg) +{ + /* No custom allocator, no need to check the format. */ + if (!cfg->alloc && !cfg->free) + return 0; + + /* When passing a custom allocator, both the alloc and free + * functions should be provided. + */ + if (!cfg->alloc || !cfg->free) + return -EINVAL; + + /* Make sure the format supports custom allocators. 
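/*
 * A sketch of the driver side of the cfg->alloc/cfg->free hooks threaded
 * through __arm_lpae_alloc_pages() above.  The hook signatures are inferred
 * from the call sites in this hunk; my_pt_alloc()/my_pt_free(), struct my_dev
 * and its pgtable_bytes counter are hypothetical, and here the hooks merely
 * add accounting on top of the stock page allocator.  Note the table code
 * expects zeroed pages, so the custom allocator must zero them itself.
 */
static void *my_pt_alloc(void *cookie, size_t size, gfp_t gfp)
{
	struct my_dev *mdev = cookie;	/* cookie given to alloc_io_pgtable_ops() */

	atomic_long_add(size, &mdev->pgtable_bytes);
	return (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size));
}

static void my_pt_free(void *cookie, void *pages, size_t size)
{
	struct my_dev *mdev = cookie;

	atomic_long_sub(size, &mdev->pgtable_bytes);
	free_pages((unsigned long)pages, get_order(size));
}

	struct io_pgtable_cfg cfg = { /* ias/oas/pgsize_bitmap/iommu_dev as usual */ };
	struct io_pgtable_ops *ops;

	cfg.alloc = my_pt_alloc;	/* both hooks must be set, or neither */
	cfg.free  = my_pt_free;
	ops = alloc_io_pgtable_ops(ARM_64_LPAE_S1, &cfg, mdev);
	if (!ops)	/* e.g. the format lacks IO_PGTABLE_CAP_CUSTOM_ALLOCATOR */
		return -EINVAL;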
*/ + if (io_pgtable_init_table[fmt]->caps & IO_PGTABLE_CAP_CUSTOM_ALLOCATOR) + return 0; + + return -EINVAL; +} + struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt, struct io_pgtable_cfg *cfg, void *cookie) @@ -44,6 +64,9 @@ struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt, if (fmt >= IO_PGTABLE_NUM_FMTS) return NULL; + if (check_custom_allocator(fmt, cfg)) + return NULL; + fns = io_pgtable_init_table[fmt]; if (!fns) return NULL; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 60c7b6c530d7fed246da14d94f361a1d21868c48..9db6b81d2caa114729caf30df8a38abe2224f8fa 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -37,7 +37,6 @@ #include "iommu-priv.h" #include "iommu-sva.h" -#include "iommu-priv.h" static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); @@ -149,7 +148,7 @@ struct iommu_group_attribute iommu_group_attr_##_name = \ static LIST_HEAD(iommu_device_list); static DEFINE_SPINLOCK(iommu_device_lock); -static struct bus_type * const iommu_buses[] = { +static const struct bus_type * const iommu_buses[] = { &platform_bus_type, #ifdef CONFIG_PCI &pci_bus_type, @@ -258,13 +257,6 @@ int iommu_device_register(struct iommu_device *iommu, /* We need to be able to take module references appropriately */ if (WARN_ON(is_module_address((unsigned long)ops) && !ops->owner)) return -EINVAL; - /* - * Temporarily enforce global restriction to a single driver. This was - * already the de-facto behaviour, since any possible combination of - * existing drivers would compete for at least the PCI or platform bus. - */ - if (iommu_buses[0]->iommu_ops && iommu_buses[0]->iommu_ops != ops) - return -EBUSY; iommu->ops = ops; if (hwdev) @@ -274,10 +266,8 @@ int iommu_device_register(struct iommu_device *iommu, list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); - for (int i = 0; i < ARRAY_SIZE(iommu_buses) && !err; i++) { - iommu_buses[i]->iommu_ops = ops; + for (int i = 0; i < ARRAY_SIZE(iommu_buses) && !err; i++) err = bus_iommu_probe(iommu_buses[i]); - } if (err) iommu_device_unregister(iommu); return err; @@ -292,6 +282,10 @@ void iommu_device_unregister(struct iommu_device *iommu) spin_lock(&iommu_device_lock); list_del(&iommu->list); spin_unlock(&iommu_device_lock); + + /* Pairs with the alloc in generic_single_device_group() */ + iommu_group_put(iommu->singleton_group); + iommu->singleton_group = NULL; } EXPORT_SYMBOL_GPL(iommu_device_unregister); @@ -326,7 +320,6 @@ int iommu_device_register_bus(struct iommu_device *iommu, list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); - bus->iommu_ops = ops; err = bus_iommu_probe(bus); if (err) { iommu_device_unregister_bus(iommu, bus, nb); @@ -365,6 +358,15 @@ static void dev_iommu_free(struct device *dev) kfree(param); } +/* + * Internal equivalent of device_iommu_mapped() for when we care that a device + * actually has API ops, and don't want false positives from VFIO-only groups. 
+ */ +static bool dev_has_iommu(struct device *dev) +{ + return dev->iommu && dev->iommu->iommu_dev; +} + static u32 dev_iommu_get_max_pasids(struct device *dev) { u32 max_pasids = 0, bits = 0; @@ -415,6 +417,7 @@ static int iommu_init_device(struct device *dev, const struct iommu_ops *ops) ret = PTR_ERR(iommu_dev); goto err_module_put; } + dev->iommu->iommu_dev = iommu_dev; ret = iommu_device_link(iommu_dev, dev); if (ret) @@ -429,7 +432,6 @@ static int iommu_init_device(struct device *dev, const struct iommu_ops *ops) } dev->iommu_group = group; - dev->iommu->iommu_dev = iommu_dev; dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev); if (ops->is_attach_deferred) dev->iommu->attach_deferred = ops->is_attach_deferred(dev); @@ -443,6 +445,7 @@ static int iommu_init_device(struct device *dev, const struct iommu_ops *ops) err_module_put: module_put(ops->owner); err_free: + dev->iommu->iommu_dev = NULL; dev_iommu_free(dev); return ret; } @@ -458,24 +461,13 @@ static void iommu_deinit_device(struct device *dev) /* * release_device() must stop using any attached domain on the device. - * If there are still other devices in the group, they are not affected + * If there are still other devices in the group they are not effected * by this callback. * - * If the iommu driver provides release_domain, the core code ensures - * that domain is attached prior to calling release_device. Drivers can - * use this to enforce a translation on the idle iommu. Typically, the - * global static blocked_domain is a good choice. - * - * Otherwise, the iommu driver must set the device to either an identity - * or a blocking translation in release_device() and stop using any - * domain pointer, as it is going to be freed. - * - * Regardless, if a delayed attach never occurred, then the release - * should still avoid touching any hardware configuration either. + * The IOMMU driver must set the device to either an identity or + * blocking translation and stop using any domain pointer, as it is + * going to be freed. */ - if (!dev->iommu->attach_deferred && ops->release_domain) - ops->release_domain->ops->attach_dev(ops->release_domain, dev); - if (ops->release_device) ops->release_device(dev); @@ -505,11 +497,26 @@ DEFINE_MUTEX(iommu_probe_device_lock); static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { - const struct iommu_ops *ops = dev->bus->iommu_ops; + const struct iommu_ops *ops; + struct iommu_fwspec *fwspec; struct iommu_group *group; struct group_device *gdev; int ret; + /* + * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU + * instances with non-NULL fwnodes, and client devices should have been + * identified with a fwspec by this point. Otherwise, we can currently + * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can + * be present, and that any of their registered instances has suitable + * ops for probing, and thus cheekily co-opt the same mechanism. 
+ */ + fwspec = dev_iommu_fwspec_get(dev); + if (fwspec && fwspec->ops) + ops = fwspec->ops; + else + ops = iommu_ops_from_fwnode(NULL); + if (!ops) return -ENODEV; /* @@ -634,7 +641,7 @@ static void __iommu_group_remove_device(struct device *dev) list_del(&device->list); __iommu_group_free_device(group, device); - if (dev->iommu && dev->iommu->iommu_dev) + if (dev_has_iommu(dev)) iommu_deinit_device(dev); else dev->iommu_group = NULL; @@ -833,7 +840,7 @@ int iommu_get_group_resv_regions(struct iommu_group *group, * Non-API groups still expose reserved_regions in sysfs, * so filter out calls that get here that way. */ - if (!device->dev->iommu) + if (!dev_has_iommu(device->dev)) break; INIT_LIST_HEAD(&dev_resv_regions); @@ -1250,6 +1257,12 @@ void iommu_group_remove_device(struct device *dev) } EXPORT_SYMBOL_GPL(iommu_group_remove_device); +static struct device *iommu_group_first_dev(struct iommu_group *group) +{ + lockdep_assert_held(&group->mutex); + return list_first_entry(&group->devices, struct group_device, list)->dev; +} + /** * iommu_group_for_each_dev - iterate over each device in the group * @group: the group @@ -1667,6 +1680,27 @@ struct iommu_group *generic_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(generic_device_group); +/* + * Generic device_group call-back function. It just allocates one + * iommu-group per iommu driver instance shared by every device + * probed by that iommu driver. + */ +struct iommu_group *generic_single_device_group(struct device *dev) +{ + struct iommu_device *iommu = dev->iommu->iommu_dev; + + if (!iommu->singleton_group) { + struct iommu_group *group; + + group = iommu_group_alloc(); + if (IS_ERR(group)) + return group; + iommu->singleton_group = group; + } + return iommu_group_ref_get(iommu->singleton_group); +} +EXPORT_SYMBOL_GPL(generic_single_device_group); + /* * Use standard PCI bus topology, isolation features, and DMA alias quirks * to find or create an IOMMU group for a device. @@ -1756,23 +1790,6 @@ __iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) return __iommu_group_domain_alloc(group, req_type); } -/* - * Returns the iommu_ops for the devices in an iommu group. - * - * It is assumed that all devices in an iommu group are managed by a single - * IOMMU unit. Therefore, this returns the dev_iommu_ops of the first device - * in the group. - */ -static const struct iommu_ops *group_iommu_ops(struct iommu_group *group) -{ - struct group_device *device = - list_first_entry(&group->devices, struct group_device, list); - - lockdep_assert_held(&group->mutex); - - return dev_iommu_ops(device->dev); -} - /* * req_type of 0 means "auto" which means to select a domain based on * iommu_def_domain_type or what the driver actually supports. 
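/*
 * What a driver that wants the "one group per IOMMU instance" behaviour is
 * expected to do with the new helper: point ->device_group at it instead of
 * open-coding a singleton group.  The my_* callbacks are placeholders; the
 * matching iommu_group_put() happens in iommu_device_unregister(), as shown
 * earlier in this patch.
 */
static const struct iommu_ops my_iommu_ops = {
	.device_group	= generic_single_device_group,
	.probe_device	= my_probe_device,
	.release_device	= my_release_device,
	.domain_alloc	= my_domain_alloc,
	/* ... */
};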
@@ -1780,7 +1797,7 @@ static const struct iommu_ops *group_iommu_ops(struct iommu_group *group) static struct iommu_domain * iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { - const struct iommu_ops *ops = group_iommu_ops(group); + const struct iommu_ops *ops = dev_iommu_ops(iommu_group_first_dev(group)); struct iommu_domain *dom; lockdep_assert_held(&group->mutex); @@ -1801,15 +1818,15 @@ iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) /* The driver gave no guidance on what type to use, try the default */ dom = __iommu_group_alloc_default_domain(group, iommu_def_domain_type); - if (dom) + if (!IS_ERR(dom)) return dom; /* Otherwise IDENTITY and DMA_FQ defaults will try DMA */ if (iommu_def_domain_type == IOMMU_DOMAIN_DMA) - return NULL; + return ERR_PTR(-EINVAL); dom = __iommu_group_alloc_default_domain(group, IOMMU_DOMAIN_DMA); - if (!dom) - return NULL; + if (IS_ERR(dom)) + return dom; pr_warn("Failed to allocate default IOMMU domain of type %u for group %s - Falling back to IOMMU_DOMAIN_DMA", iommu_def_domain_type, group->name); @@ -1860,7 +1877,7 @@ static int iommu_bus_notifier(struct notifier_block *nb, static int iommu_get_def_domain_type(struct iommu_group *group, struct device *dev, int cur_type) { - const struct iommu_ops *ops = group_iommu_ops(group); + const struct iommu_ops *ops = dev_iommu_ops(dev); int type; if (!ops->def_domain_type) @@ -2009,9 +2026,28 @@ int bus_iommu_probe(const struct bus_type *bus) return 0; } +/** + * iommu_present() - make platform-specific assumptions about an IOMMU + * @bus: bus to check + * + * Do not use this function. You want device_iommu_mapped() instead. + * + * Return: true if some IOMMU is present and aware of devices on the given bus; + * in general it may not be the only IOMMU, and it may not have anything to do + * with whatever device you are ultimately interested in. + */ bool iommu_present(const struct bus_type *bus) { - return bus->iommu_ops != NULL; + bool ret = false; + + for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) { + if (iommu_buses[i] == bus) { + spin_lock(&iommu_device_lock); + ret = !list_empty(&iommu_device_list); + spin_unlock(&iommu_device_lock); + } + } + return ret; } EXPORT_SYMBOL_GPL(iommu_present); @@ -2027,7 +2063,7 @@ bool device_iommu_capable(struct device *dev, enum iommu_cap cap) { const struct iommu_ops *ops; - if (!dev->iommu || !dev->iommu->iommu_dev) + if (!dev_has_iommu(dev)) return false; ops = dev_iommu_ops(dev); @@ -2100,12 +2136,20 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, else if (ops->domain_alloc) domain = ops->domain_alloc(alloc_type); else - return NULL; + return ERR_PTR(-EOPNOTSUPP); + /* + * Many domain_alloc ops now return ERR_PTR, make things easier for the + * driver by accepting ERR_PTR from all domain_alloc ops instead of + * having two rules. 
+ */ + if (IS_ERR(domain)) + return domain; if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); domain->type = type; + domain->owner = ops; /* * If not already set, assume all sizes by default; the driver * may override this later @@ -2116,9 +2160,14 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, if (!domain->ops) domain->ops = ops->default_domain_ops; - if (iommu_is_dma_domain(domain) && iommu_get_dma_cookie(domain)) { - iommu_domain_free(domain); - domain = NULL; + if (iommu_is_dma_domain(domain)) { + int rc; + + rc = iommu_get_dma_cookie(domain); + if (rc) { + iommu_domain_free(domain); + return ERR_PTR(rc); + } } return domain; } @@ -2126,19 +2175,40 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, static struct iommu_domain * __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) { - struct device *dev = - list_first_entry(&group->devices, struct group_device, list) - ->dev; + struct device *dev = iommu_group_first_dev(group); + + return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type); +} + +static int __iommu_domain_alloc_dev(struct device *dev, void *data) +{ + const struct iommu_ops **ops = data; + + if (!dev_has_iommu(dev)) + return 0; + + if (WARN_ONCE(*ops && *ops != dev_iommu_ops(dev), + "Multiple IOMMU drivers present for bus %s, which the public IOMMU API can't fully support yet. You will still need to disable one or more for this to work, sorry!\n", + dev_bus_name(dev))) + return -EBUSY; - return __iommu_domain_alloc(group_iommu_ops(group), dev, type); + *ops = dev_iommu_ops(dev); + return 0; } struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) { - if (bus == NULL || bus->iommu_ops == NULL) + const struct iommu_ops *ops = NULL; + int err = bus_for_each_dev(bus, NULL, &ops, __iommu_domain_alloc_dev); + struct iommu_domain *domain; + + if (err || !ops) return NULL; - return __iommu_domain_alloc(bus->iommu_ops, NULL, - IOMMU_DOMAIN_UNMANAGED); + + domain = __iommu_domain_alloc(ops, NULL, IOMMU_DOMAIN_UNMANAGED); + if (IS_ERR(domain)) + return NULL; + return domain; } EXPORT_SYMBOL_GPL(iommu_domain_alloc); @@ -2198,10 +2268,10 @@ static int __iommu_attach_device(struct iommu_domain *domain, */ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) { - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; int ret; - group = iommu_group_get(dev); if (!group) return -ENODEV; @@ -2218,8 +2288,6 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) out_unlock: mutex_unlock(&group->mutex); - iommu_group_put(group); - return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device); @@ -2234,9 +2302,9 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) void iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; - group = iommu_group_get(dev); if (!group) return; @@ -2248,24 +2316,18 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev) out_unlock: mutex_unlock(&group->mutex); - iommu_group_put(group); } EXPORT_SYMBOL_GPL(iommu_detach_device); struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) { - struct iommu_domain *domain; - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; - group = iommu_group_get(dev); if 
(!group) return NULL; - domain = group->domain; - - iommu_group_put(group); - - return domain; + return group->domain; } EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); @@ -2281,10 +2343,16 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev) static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { + struct device *dev; + if (group->domain && group->domain != group->default_domain && group->domain != group->blocking_domain) return -EBUSY; + dev = iommu_group_first_dev(group); + if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner) + return -EINVAL; + return __iommu_group_set_domain(group, domain); } @@ -2539,30 +2607,6 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, return pgsize; } -static int __iommu_map_pages(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, - gfp_t gfp, size_t *mapped) -{ - const struct iommu_domain_ops *ops = domain->ops; - size_t pgsize, count; - int ret; - - pgsize = iommu_pgsize(domain, iova, paddr, size, &count); - - pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", - iova, &paddr, pgsize, count); - - if (ops->map_pages) { - ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, - gfp, mapped); - } else { - ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); - *mapped = ret ? 0 : pgsize; - } - - return ret; -} - static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { @@ -2573,13 +2617,12 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t orig_paddr = paddr; int ret = 0; - if (unlikely(!(ops->map || ops->map_pages) || - domain->pgsize_bitmap == 0UL)) - return -ENODEV; - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; + if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL)) + return -ENODEV; + /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); @@ -2597,10 +2640,14 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t mapped = 0; + size_t pgsize, count, mapped = 0; + + pgsize = iommu_pgsize(domain, iova, paddr, size, &count); - ret = __iommu_map_pages(domain, iova, paddr, size, prot, gfp, - &mapped); + pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", + iova, &paddr, pgsize, count); + ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, + gfp, &mapped); /* * Some pages may have been mapped, even if an error occurred, * so we should account for those so they can be unmapped. @@ -2637,25 +2684,21 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, return -EINVAL; ret = __iommu_map(domain, iova, paddr, size, prot, gfp); - if (ret == 0 && ops->iotlb_sync_map) - ops->iotlb_sync_map(domain, iova, size); + if (ret == 0 && ops->iotlb_sync_map) { + ret = ops->iotlb_sync_map(domain, iova, size); + if (ret) + goto out_err; + } return ret; -} -EXPORT_SYMBOL_GPL(iommu_map); -static size_t __iommu_unmap_pages(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *iotlb_gather) -{ - const struct iommu_domain_ops *ops = domain->ops; - size_t pgsize, count; +out_err: + /* undo mappings already done */ + iommu_unmap(domain, iova, size); - pgsize = iommu_pgsize(domain, iova, iova, size, &count); - return ops->unmap_pages ? 
- ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather) : - ops->unmap(domain, iova, pgsize, iotlb_gather); + return ret; } +EXPORT_SYMBOL_GPL(iommu_map); static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, @@ -2666,11 +2709,10 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long orig_iova = iova; unsigned int min_pagesz; - if (unlikely(!(ops->unmap || ops->unmap_pages) || - domain->pgsize_bitmap == 0UL)) + if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return 0; - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) + if (WARN_ON(!ops->unmap_pages || domain->pgsize_bitmap == 0UL)) return 0; /* find out the minimum page size supported */ @@ -2694,9 +2736,10 @@ static size_t __iommu_unmap(struct iommu_domain *domain, * or we hit an area that isn't mapped. */ while (unmapped < size) { - unmapped_page = __iommu_unmap_pages(domain, iova, - size - unmapped, - iotlb_gather); + size_t pgsize, count; + + pgsize = iommu_pgsize(domain, iova, iova, size - unmapped, &count); + unmapped_page = ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather); if (!unmapped_page) break; @@ -2779,8 +2822,11 @@ ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, sg = sg_next(sg); } - if (ops->iotlb_sync_map) - ops->iotlb_sync_map(domain, iova, mapped); + if (ops->iotlb_sync_map) { + ret = ops->iotlb_sync_map(domain, iova, mapped); + if (ret) + goto out_err; + } return mapped; out_err: @@ -3025,8 +3071,8 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); */ int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) { - if (dev->iommu && dev->iommu->iommu_dev) { - const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; + if (dev_has_iommu(dev)) { + const struct iommu_ops *ops = dev_iommu_ops(dev); if (ops->dev_enable_feat) return ops->dev_enable_feat(dev, feat); @@ -3041,8 +3087,8 @@ EXPORT_SYMBOL_GPL(iommu_dev_enable_feature); */ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) { - if (dev->iommu && dev->iommu->iommu_dev) { - const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; + if (dev_has_iommu(dev)) { + const struct iommu_ops *ops = dev_iommu_ops(dev); if (ops->dev_disable_feat) return ops->dev_disable_feat(dev, feat); @@ -3079,8 +3125,8 @@ static int iommu_setup_default_domain(struct iommu_group *group, return -EINVAL; dom = iommu_group_alloc_default_domain(group, req_type); - if (!dom) - return -ENODEV; + if (IS_ERR(dom)) + return PTR_ERR(dom); if (group->default_domain == dom) return 0; @@ -3223,24 +3269,6 @@ static ssize_t iommu_group_store_type(struct iommu_group *group, return ret ?: count; } -static bool iommu_is_default_domain(struct iommu_group *group) -{ - if (group->domain == group->default_domain) - return true; - - /* - * If the default domain was set to identity and it is still an identity - * domain then we consider this a pass. This happens because of - * amd_iommu_init_device() replacing the default idenytity domain with an - * identity domain that has a different configuration for AMDGPU. - */ - if (group->default_domain && - group->default_domain->type == IOMMU_DOMAIN_IDENTITY && - group->domain && group->domain->type == IOMMU_DOMAIN_IDENTITY) - return true; - return false; -} - /** * iommu_device_use_default_domain() - Device driver wants to handle device * DMA through the kernel DMA API. 
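/*
 * With the ->map()/->unmap() fallbacks gone from the core paths above, a
 * driver has to provide the batched hooks, and ->iotlb_sync_map() may now
 * return an error that makes iommu_map() unwind.  A skeleton, with
 * my_install_pte(), my_flush_range() and my_unmap_pages() standing in for
 * hypothetical driver internals:
 */
static int my_map_pages(struct iommu_domain *domain, unsigned long iova,
			phys_addr_t paddr, size_t pgsize, size_t pgcount,
			int prot, gfp_t gfp, size_t *mapped)
{
	size_t i;

	for (i = 0; i < pgcount; i++) {
		int ret = my_install_pte(domain, iova, paddr, pgsize, prot, gfp);

		if (ret)
			return ret;	/* *mapped tells the core how much to undo */
		iova += pgsize;
		paddr += pgsize;
		*mapped += pgsize;
	}
	return 0;
}

static int my_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova,
			     size_t size)
{
	/* May fail, e.g. if posting the invalidation times out. */
	return my_flush_range(domain, iova, size);
}

static const struct iommu_domain_ops my_domain_ops = {
	.map_pages	= my_map_pages,
	.unmap_pages	= my_unmap_pages,
	.iotlb_sync_map	= my_iotlb_sync_map,
	/* ... */
};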
@@ -3251,7 +3279,8 @@ static bool iommu_is_default_domain(struct iommu_group *group) */ int iommu_device_use_default_domain(struct device *dev) { - struct iommu_group *group = iommu_group_get(dev); + /* Caller is the driver core during the pre-probe path */ + struct iommu_group *group = dev->iommu_group; int ret = 0; if (!group) @@ -3264,7 +3293,7 @@ int iommu_device_use_default_domain(struct device *dev) goto unlock_out; } if (group->owner_cnt) { - if (group->owner || !iommu_is_default_domain(group) || + if (group->domain != group->default_domain || group->owner || !xa_empty(&group->pasid_array)) { ret = -EBUSY; goto unlock_out; @@ -3275,8 +3304,6 @@ int iommu_device_use_default_domain(struct device *dev) unlock_out: mutex_unlock(&group->mutex); - iommu_group_put(group); - return ret; } @@ -3290,7 +3317,8 @@ int iommu_device_use_default_domain(struct device *dev) */ void iommu_device_unuse_default_domain(struct device *dev) { - struct iommu_group *group = iommu_group_get(dev); + /* Caller is the driver core during the post-probe path */ + struct iommu_group *group = dev->iommu_group; if (!group) return; @@ -3300,26 +3328,27 @@ void iommu_device_unuse_default_domain(struct device *dev) group->owner_cnt--; mutex_unlock(&group->mutex); - iommu_group_put(group); } static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) { + struct iommu_domain *domain; + if (group->blocking_domain) return 0; - group->blocking_domain = - __iommu_group_domain_alloc(group, IOMMU_DOMAIN_BLOCKED); - if (!group->blocking_domain) { + domain = __iommu_group_domain_alloc(group, IOMMU_DOMAIN_BLOCKED); + if (IS_ERR(domain)) { /* * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED * create an empty domain instead. */ - group->blocking_domain = __iommu_group_domain_alloc( - group, IOMMU_DOMAIN_UNMANAGED); - if (!group->blocking_domain) - return -EINVAL; + domain = __iommu_group_domain_alloc(group, + IOMMU_DOMAIN_UNMANAGED); + if (IS_ERR(domain)) + return PTR_ERR(domain); } + group->blocking_domain = domain; return 0; } @@ -3384,13 +3413,13 @@ EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner); */ int iommu_device_claim_dma_owner(struct device *dev, void *owner) { - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; int ret = 0; if (WARN_ON(!owner)) return -EINVAL; - group = iommu_group_get(dev); if (!group) return -ENODEV; @@ -3407,8 +3436,6 @@ int iommu_device_claim_dma_owner(struct device *dev, void *owner) ret = __iommu_take_dma_ownership(group, owner); unlock_out: mutex_unlock(&group->mutex); - iommu_group_put(group); - return ret; } EXPORT_SYMBOL_GPL(iommu_device_claim_dma_owner); @@ -3446,7 +3473,8 @@ EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner); */ void iommu_device_release_dma_owner(struct device *dev) { - struct iommu_group *group = iommu_group_get(dev); + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); if (group->owner_cnt > 1) @@ -3454,7 +3482,6 @@ void iommu_device_release_dma_owner(struct device *dev) else __iommu_release_dma_ownership(group); mutex_unlock(&group->mutex); - iommu_group_put(group); } EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner); @@ -3526,17 +3553,20 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { - struct iommu_group *group; + /* Caller must be a probed driver on dev */ + struct iommu_group *group = 
dev->iommu_group; void *curr; int ret; if (!domain->ops->set_dev_pasid) return -EOPNOTSUPP; - group = iommu_group_get(dev); if (!group) return -ENODEV; + if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner) + return -EINVAL; + mutex_lock(&group->mutex); curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); if (curr) { @@ -3549,8 +3579,6 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, xa_erase(&group->pasid_array, pasid); out_unlock: mutex_unlock(&group->mutex); - iommu_group_put(group); - return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); @@ -3567,14 +3595,13 @@ EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { - struct iommu_group *group = iommu_group_get(dev); + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); __iommu_remove_group_pasid(group, pasid); WARN_ON(xa_erase(&group->pasid_array, pasid) != domain); mutex_unlock(&group->mutex); - - iommu_group_put(group); } EXPORT_SYMBOL_GPL(iommu_detach_device_pasid); @@ -3596,10 +3623,10 @@ struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, unsigned int type) { + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; struct iommu_domain *domain; - struct iommu_group *group; - group = iommu_group_get(dev); if (!group) return NULL; @@ -3608,7 +3635,6 @@ struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, if (type && domain && domain->type != type) domain = ERR_PTR(-EBUSY); xa_unlock(&group->pasid_array); - iommu_group_put(group); return domain; } diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index cd0c96bf86e1a0f2f2756cfd7afbaa3385536bf3..dbcb027857b6bb2cd00c19dc28eb6c75ba53a221 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -293,7 +293,7 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); static int iommufd_group_setup_msi(struct iommufd_group *igroup, - struct iommufd_hw_pagetable *hwpt) + struct iommufd_hwpt_paging *hwpt_paging) { phys_addr_t sw_msi_start = igroup->sw_msi_start; int rc; @@ -311,8 +311,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, * matches what the IRQ layer actually expects in a newly created * domain. */ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); + if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { + rc = iommu_get_msi_cookie(hwpt_paging->common.domain, + sw_msi_start); if (rc) return rc; @@ -320,7 +321,31 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, * iommu_get_msi_cookie() can only be called once per domain, * it returns -EBUSY on later calls. 
*/ - hwpt->msi_cookie = true; + hwpt_paging->msi_cookie = true; + } + return 0; +} + +static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_device *idev) +{ + int rc; + + lockdep_assert_held(&idev->igroup->lock); + + rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt, + idev->dev, + &idev->igroup->sw_msi_start); + if (rc) + return rc; + + if (list_empty(&idev->igroup->device_list)) { + rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging); + if (rc) { + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, + idev->dev); + return rc; + } } return 0; } @@ -337,18 +362,12 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - /* Try to upgrade the domain we have */ - if (idev->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (hwpt_is_paging(hwpt)) { + rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev); if (rc) goto err_unlock; } - rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, - &idev->igroup->sw_msi_start); - if (rc) - goto err_unlock; - /* * Only attach to the group once for the first device that is in the * group. All the other devices will follow this attachment. The user @@ -357,10 +376,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. */ if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_group_setup_msi(idev->igroup, hwpt); - if (rc) - goto err_unresv; - rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; @@ -371,7 +386,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&idev->igroup->lock); return 0; err_unresv: - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + if (hwpt_is_paging(hwpt)) + iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, + idev->dev); err_unlock: mutex_unlock(&idev->igroup->lock); return rc; @@ -388,7 +405,9 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) iommu_detach_group(hwpt->domain, idev->igroup->group); idev->igroup->hwpt = NULL; } - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + if (hwpt_is_paging(hwpt)) + iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, + idev->dev); mutex_unlock(&idev->igroup->lock); /* Caller must destroy hwpt */ @@ -418,14 +437,55 @@ static bool iommufd_device_is_attached(struct iommufd_device *idev) return false; } +static void +iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommufd_device *cur; + + lockdep_assert_held(&igroup->lock); + + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); +} + +static int +iommufd_group_do_replace_paging(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt; + struct iommufd_device *cur; + int rc; + + lockdep_assert_held(&igroup->lock); + + if (!hwpt_is_paging(old_hwpt) || + hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) { + rc = iopt_table_enforce_dev_resv_regions( + &hwpt_paging->ioas->iopt, cur->dev, NULL); + if (rc) + goto err_unresv; + } + } + + rc = iommufd_group_setup_msi(igroup, hwpt_paging); + if (rc) + goto err_unresv; + return 0; + +err_unresv: + iommufd_group_remove_reserved_iova(igroup, hwpt_paging); + return rc; +} + static struct iommufd_hw_pagetable * iommufd_device_do_replace(struct 
iommufd_device *idev, struct iommufd_hw_pagetable *hwpt) { struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; - unsigned int num_devices = 0; - struct iommufd_device *cur; + unsigned int num_devices; int rc; mutex_lock(&idev->igroup->lock); @@ -445,42 +505,27 @@ iommufd_device_do_replace(struct iommufd_device *idev, return NULL; } - /* Try to upgrade the domain we have */ - list_for_each_entry(cur, &igroup->device_list, group_item) { - num_devices++; - if (cur->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); - if (rc) - goto err_unlock; - } - } - old_hwpt = igroup->hwpt; - if (hwpt->ioas != old_hwpt->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) { - rc = iopt_table_enforce_dev_resv_regions( - &hwpt->ioas->iopt, cur->dev, NULL); - if (rc) - goto err_unresv; - } + if (hwpt_is_paging(hwpt)) { + rc = iommufd_group_do_replace_paging(igroup, + to_hwpt_paging(hwpt)); + if (rc) + goto err_unlock; } - rc = iommufd_group_setup_msi(idev->igroup, hwpt); - if (rc) - goto err_unresv; - rc = iommu_group_replace_domain(igroup->group, hwpt->domain); if (rc) goto err_unresv; - if (hwpt->ioas != old_hwpt->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) - iopt_remove_reserved_iova(&old_hwpt->ioas->iopt, - cur->dev); - } + if (hwpt_is_paging(old_hwpt) && + (!hwpt_is_paging(hwpt) || + to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas)) + iommufd_group_remove_reserved_iova(igroup, + to_hwpt_paging(old_hwpt)); igroup->hwpt = hwpt; + num_devices = list_count_nodes(&igroup->device_list); /* * Move the refcounts held by the device_list to the new hwpt. Retain a * refcount for this thread as the caller will free it. @@ -494,8 +539,9 @@ iommufd_device_do_replace(struct iommufd_device *idev, /* Caller must destroy old_hwpt */ return old_hwpt; err_unresv: - list_for_each_entry(cur, &igroup->device_list, group_item) - iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev); + if (hwpt_is_paging(hwpt)) + iommufd_group_remove_reserved_iova(igroup, + to_hwpt_paging(old_hwpt)); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); @@ -523,6 +569,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, */ bool immediate_attach = do_attach == iommufd_device_do_attach; struct iommufd_hw_pagetable *destroy_hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; /* @@ -531,10 +578,11 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, * other. 
*/ mutex_lock(&ioas->mutex); - list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { - if (!hwpt->auto_domain) + list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { + if (!hwpt_paging->auto_domain) continue; + hwpt = &hwpt_paging->common; if (!iommufd_lock_obj(&hwpt->obj)) continue; destroy_hwpt = (*do_attach)(idev, hwpt); @@ -555,12 +603,13 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } - hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, - 0, immediate_attach); - if (IS_ERR(hwpt)) { - destroy_hwpt = ERR_CAST(hwpt); + hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, + immediate_attach, NULL); + if (IS_ERR(hwpt_paging)) { + destroy_hwpt = ERR_CAST(hwpt_paging); goto out_unlock; } + hwpt = &hwpt_paging->common; if (!immediate_attach) { destroy_hwpt = (*do_attach)(idev, hwpt); @@ -570,7 +619,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, destroy_hwpt = NULL; } - hwpt->auto_domain = true; + hwpt_paging->auto_domain = true; *pt_id = hwpt->obj.id; iommufd_object_finalize(idev->ictx, &hwpt->obj); @@ -595,7 +644,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, return PTR_ERR(pt_obj); switch (pt_obj->type) { - case IOMMUFD_OBJ_HW_PAGETABLE: { + case IOMMUFD_OBJ_HWPT_NESTED: + case IOMMUFD_OBJ_HWPT_PAGING: { struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); @@ -633,8 +683,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, /** * iommufd_device_attach - Connect a device to an iommu_domain * @idev: device to attach - * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE - * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING + * Output the IOMMUFD_OBJ_HWPT_PAGING ID * * This connects the device to an iommu_domain, either automatically or manually * selected. Once this completes the device could do DMA. 
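/*
 * A sketch of the attach sequence from a consumer's point of view (ictx, dev,
 * dev_id and pt_id are placeholders supplied by that consumer, e.g. a
 * VFIO-like driver acting on a userspace request):
 */
	struct iommufd_device *idev;
	int rc;

	idev = iommufd_device_bind(ictx, dev, &dev_id);
	if (IS_ERR(idev))
		return PTR_ERR(idev);

	/* pt_id names an IOAS or HWPT_PAGING object going in ... */
	rc = iommufd_device_attach(idev, &pt_id);
	if (rc) {
		iommufd_device_unbind(idev);
		return rc;
	}
	/* ... and holds the (possibly auto-allocated) HWPT_PAGING ID coming out. */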
@@ -662,8 +712,8 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); /** * iommufd_device_replace - Change the device's iommu_domain * @idev: device to change - * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE - * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING + * Output the IOMMUFD_OBJ_HWPT_PAGING ID * * This is the same as:: * diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 72a5269984b0183ba262421486a09c0ccebf1077..5be7f513b622d4c4eabecad230873a880ae9a4ca 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -8,61 +8,84 @@ #include "../iommu-priv.h" #include "iommufd_private.h" -void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) +void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) { - struct iommufd_hw_pagetable *hwpt = - container_of(obj, struct iommufd_hw_pagetable, obj); + struct iommufd_hwpt_paging *hwpt_paging = + container_of(obj, struct iommufd_hwpt_paging, common.obj); - if (!list_empty(&hwpt->hwpt_item)) { - mutex_lock(&hwpt->ioas->mutex); - list_del(&hwpt->hwpt_item); - mutex_unlock(&hwpt->ioas->mutex); + if (!list_empty(&hwpt_paging->hwpt_item)) { + mutex_lock(&hwpt_paging->ioas->mutex); + list_del(&hwpt_paging->hwpt_item); + mutex_unlock(&hwpt_paging->ioas->mutex); - iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + iopt_table_remove_domain(&hwpt_paging->ioas->iopt, + hwpt_paging->common.domain); } - if (hwpt->domain) - iommu_domain_free(hwpt->domain); + if (hwpt_paging->common.domain) + iommu_domain_free(hwpt_paging->common.domain); - refcount_dec(&hwpt->ioas->obj.users); + refcount_dec(&hwpt_paging->ioas->obj.users); } -void iommufd_hw_pagetable_abort(struct iommufd_object *obj) +void iommufd_hwpt_paging_abort(struct iommufd_object *obj) { - struct iommufd_hw_pagetable *hwpt = - container_of(obj, struct iommufd_hw_pagetable, obj); + struct iommufd_hwpt_paging *hwpt_paging = + container_of(obj, struct iommufd_hwpt_paging, common.obj); /* The ioas->mutex must be held until finalize is called. 
*/ - lockdep_assert_held(&hwpt->ioas->mutex); + lockdep_assert_held(&hwpt_paging->ioas->mutex); - if (!list_empty(&hwpt->hwpt_item)) { - list_del_init(&hwpt->hwpt_item); - iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + if (!list_empty(&hwpt_paging->hwpt_item)) { + list_del_init(&hwpt_paging->hwpt_item); + iopt_table_remove_domain(&hwpt_paging->ioas->iopt, + hwpt_paging->common.domain); } - iommufd_hw_pagetable_destroy(obj); + iommufd_hwpt_paging_destroy(obj); } -int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) { - if (hwpt->enforce_cache_coherency) + struct iommufd_hwpt_nested *hwpt_nested = + container_of(obj, struct iommufd_hwpt_nested, common.obj); + + if (hwpt_nested->common.domain) + iommu_domain_free(hwpt_nested->common.domain); + + refcount_dec(&hwpt_nested->parent->common.obj.users); +} + +void iommufd_hwpt_nested_abort(struct iommufd_object *obj) +{ + iommufd_hwpt_nested_destroy(obj); +} + +static int +iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommu_domain *paging_domain = hwpt_paging->common.domain; + + if (hwpt_paging->enforce_cache_coherency) return 0; - if (hwpt->domain->ops->enforce_cache_coherency) - hwpt->enforce_cache_coherency = - hwpt->domain->ops->enforce_cache_coherency( - hwpt->domain); - if (!hwpt->enforce_cache_coherency) + if (paging_domain->ops->enforce_cache_coherency) + hwpt_paging->enforce_cache_coherency = + paging_domain->ops->enforce_cache_coherency( + paging_domain); + if (!hwpt_paging->enforce_cache_coherency) return -EINVAL; return 0; } /** - * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device + * iommufd_hwpt_paging_alloc() - Get a PAGING iommu_domain for a device * @ictx: iommufd context * @ioas: IOAS to associate the domain with * @idev: Device to get an iommu_domain for * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt + * @user_data: The user provided driver specific data describing the domain to + * create * * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT * will be linked to the given ioas and upon return the underlying iommu_domain @@ -72,36 +95,47 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on * the returned hwpt. 
*/ -struct iommufd_hw_pagetable * -iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach) +struct iommufd_hwpt_paging * +iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, u32 flags, + bool immediate_attach, + const struct iommu_user_data *user_data) { + const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; int rc; lockdep_assert_held(&ioas->mutex); - if (flags && !ops->domain_alloc_user) + if ((flags || user_data) && !ops->domain_alloc_user) + return ERR_PTR(-EOPNOTSUPP); + if (flags & ~valid_flags) return ERR_PTR(-EOPNOTSUPP); - hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE); - if (IS_ERR(hwpt)) - return hwpt; + hwpt_paging = __iommufd_object_alloc( + ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); + if (IS_ERR(hwpt_paging)) + return ERR_CAST(hwpt_paging); + hwpt = &hwpt_paging->common; - INIT_LIST_HEAD(&hwpt->hwpt_item); + INIT_LIST_HEAD(&hwpt_paging->hwpt_item); /* Pairs with iommufd_hw_pagetable_destroy() */ refcount_inc(&ioas->obj.users); - hwpt->ioas = ioas; + hwpt_paging->ioas = ioas; + hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; if (ops->domain_alloc_user) { - hwpt->domain = ops->domain_alloc_user(idev->dev, flags); + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL, + user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; goto out_abort; } + hwpt->domain->owner = ops; } else { hwpt->domain = iommu_domain_alloc(idev->dev->bus); if (!hwpt->domain) { @@ -116,9 +150,16 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, * doing any maps. It is an iommu driver bug to report * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on * a new domain. + * + * The cache coherency mode must be configured here and unchanged later. + * Note that a HWPT (non-CC) created for a device (non-CC) can be later + * reused by another device (either non-CC or CC). However, A HWPT (CC) + * created for a device (CC) cannot be reused by another device (non-CC) + * but only devices (CC). Instead user space in this case would need to + * allocate a separate HWPT (non-CC). */ if (idev->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); + rc = iommufd_hwpt_paging_enforce_cc(hwpt_paging); if (WARN_ON(rc)) goto out_abort; } @@ -135,11 +176,11 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } - rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); + rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain); if (rc) goto out_detach; - list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); - return hwpt; + list_add_tail(&hwpt_paging->hwpt_item, &ioas->hwpt_list); + return hwpt_paging; out_detach: if (immediate_attach) @@ -149,35 +190,121 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, return ERR_PTR(rc); } +/** + * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a device + * @ictx: iommufd context + * @parent: Parent PAGING-type hwpt to associate the domain with + * @idev: Device to get an iommu_domain for + * @flags: Flags from userspace + * @user_data: user_data pointer. 
Must be valid + * + * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as + * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of + * being a parent. + */ +static struct iommufd_hwpt_nested * +iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *parent, + struct iommufd_device *idev, u32 flags, + const struct iommu_user_data *user_data) +{ + const struct iommu_ops *ops = dev_iommu_ops(idev->dev); + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_hw_pagetable *hwpt; + int rc; + + if (flags || !user_data->len || !ops->domain_alloc_user) + return ERR_PTR(-EOPNOTSUPP); + if (parent->auto_domain || !parent->nest_parent) + return ERR_PTR(-EINVAL); + + hwpt_nested = __iommufd_object_alloc( + ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); + if (IS_ERR(hwpt_nested)) + return ERR_CAST(hwpt_nested); + hwpt = &hwpt_nested->common; + + refcount_inc(&parent->common.obj.users); + hwpt_nested->parent = parent; + + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, + parent->common.domain, user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + hwpt->domain->owner = ops; + + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + rc = -EINVAL; + goto out_abort; + } + return hwpt_nested; + +out_abort: + iommufd_object_abort_and_destroy(ictx, &hwpt->obj); + return ERR_PTR(rc); +} + int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->data_type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas = NULL; + struct iommufd_object *pt_obj; struct iommufd_device *idev; - struct iommufd_ioas *ioas; int rc; - if ((cmd->flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING)) || - cmd->__reserved) + if (cmd->__reserved) return -EOPNOTSUPP; + if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) + return -EINVAL; idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); - ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id); - if (IS_ERR(ioas)) { - rc = PTR_ERR(ioas); + pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) { + rc = -EINVAL; goto out_put_idev; } - mutex_lock(&ioas->mutex); - hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, - idev, cmd->flags, false); - if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); - goto out_unlock; + if (pt_obj->type == IOMMUFD_OBJ_IOAS) { + struct iommufd_hwpt_paging *hwpt_paging; + + ioas = container_of(pt_obj, struct iommufd_ioas, obj); + mutex_lock(&ioas->mutex); + hwpt_paging = iommufd_hwpt_paging_alloc( + ucmd->ictx, ioas, idev, cmd->flags, false, + user_data.len ? 
&user_data : NULL); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); + goto out_unlock; + } + hwpt = &hwpt_paging->common; + } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_nested *hwpt_nested; + + hwpt_nested = iommufd_hwpt_nested_alloc( + ucmd->ictx, + container_of(pt_obj, struct iommufd_hwpt_paging, + common.obj), + idev, cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; + } else { + rc = -EINVAL; + goto out_put_pt; } cmd->out_hwpt_id = hwpt->obj.id; @@ -190,8 +317,10 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) out_hwpt: iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj); out_unlock: - mutex_unlock(&ioas->mutex); - iommufd_put_object(&ioas->obj); + if (ioas) + mutex_unlock(&ioas->mutex); +out_put_pt: + iommufd_put_object(pt_obj); out_put_idev: iommufd_put_object(&idev->obj); return rc; @@ -200,7 +329,7 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd; - struct iommufd_hw_pagetable *hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = -EOPNOTSUPP; bool enable; @@ -208,23 +337,24 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE) return rc; - hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); - if (IS_ERR(hwpt)) - return PTR_ERR(hwpt); + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) + return PTR_ERR(hwpt_paging); - ioas = hwpt->ioas; + ioas = hwpt_paging->ioas; enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE; - rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt->domain, enable); + rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain, + enable); - iommufd_put_object(&hwpt->obj); + iommufd_put_object(&hwpt_paging->common.obj); return rc; } int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd; - struct iommufd_hw_pagetable *hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = -EOPNOTSUPP; @@ -232,14 +362,14 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) cmd->__reserved) return -EOPNOTSUPP; - hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); - if (IS_ERR(hwpt)) - return PTR_ERR(hwpt); + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) + return PTR_ERR(hwpt_paging); - ioas = hwpt->ioas; - rc = iopt_read_and_clear_dirty_data(&ioas->iopt, hwpt->domain, - cmd->flags, cmd); + ioas = hwpt_paging->ioas; + rc = iopt_read_and_clear_dirty_data( + &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd); - iommufd_put_object(&hwpt->obj); + iommufd_put_object(&hwpt_paging->common.obj); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 034129130db3757ef58ee8a789747dcaa3586d7e..a74cfefffbc6c5045c7b22063f66978f1f275e59 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -123,7 +123,8 @@ enum iommufd_object_type { IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_DEVICE, - IOMMUFD_OBJ_HW_PAGETABLE, + IOMMUFD_OBJ_HWPT_PAGING, + IOMMUFD_OBJ_HWPT_NESTED, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, #ifdef CONFIG_IOMMUFD_TEST @@ -181,7 +182,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum 
iommufd_object_type type); -#define iommufd_object_alloc(ictx, ptr, type) \ +#define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ @@ -190,6 +191,9 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, type), \ typeof(*(ptr)), obj) +#define iommufd_object_alloc(ictx, ptr, type) \ + __iommufd_object_alloc(ictx, ptr, type, obj) + /* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The @@ -243,46 +247,75 @@ int iommufd_check_iova_range(struct io_pagetable *iopt, */ struct iommufd_hw_pagetable { struct iommufd_object obj; - struct iommufd_ioas *ioas; struct iommu_domain *domain; +}; + +struct iommufd_hwpt_paging { + struct iommufd_hw_pagetable common; + struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; bool msi_cookie : 1; + bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; }; -static inline struct iommufd_hw_pagetable * -iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) +struct iommufd_hwpt_nested { + struct iommufd_hw_pagetable common; + struct iommufd_hwpt_paging *parent; +}; + +static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) +{ + return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING; +} + +static inline struct iommufd_hwpt_paging * +to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) +{ + return container_of(hwpt, struct iommufd_hwpt_paging, common); +} + +static inline struct iommufd_hwpt_paging * +iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, - IOMMUFD_OBJ_HW_PAGETABLE), - struct iommufd_hw_pagetable, obj); + IOMMUFD_OBJ_HWPT_PAGING), + struct iommufd_hwpt_paging, common.obj); } int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); -struct iommufd_hw_pagetable * -iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach); -int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt); +struct iommufd_hwpt_paging * +iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, u32 flags, + bool immediate_attach, + const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); -void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); -void iommufd_hw_pagetable_abort(struct iommufd_object *obj); +void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); +void iommufd_hwpt_paging_abort(struct iommufd_object *obj); +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); +void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) { - lockdep_assert_not_held(&hwpt->ioas->mutex); - if (hwpt->auto_domain) - iommufd_object_deref_user(ictx, &hwpt->obj); - else - refcount_dec(&hwpt->obj.users); + if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); + + lockdep_assert_not_held(&hwpt_paging->ioas->mutex); + + if (hwpt_paging->auto_domain) { 
+ iommufd_object_deref_user(ictx, &hwpt->obj); + return; + } + } + refcount_dec(&hwpt->obj.users); } struct iommufd_group { diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 1f2e93d3d4e87f758f0eb67e93343bf86f813f16..7910fbe1962d78b9c8b65726fad12e75c0fd4a22 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -46,6 +46,11 @@ enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, }; +enum { + MOCK_NESTED_DOMAIN_IOTLB_ID_MAX = 3, + MOCK_NESTED_DOMAIN_IOTLB_NUM = 4, +}; + struct iommu_test_cmd { __u32 size; __u32 op; @@ -130,4 +135,17 @@ struct iommu_test_hw_info { __u32 test_reg; }; +/* Should not be equal to any defined value in enum iommu_hwpt_data_type */ +#define IOMMU_HWPT_DATA_SELFTEST 0xdead +#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef + +/** + * struct iommu_hwpt_selftest + * + * @iotlb: default mock iotlb value, IOMMU_TEST_IOTLB_DEFAULT + */ +struct iommu_hwpt_selftest { + __u32 iotlb; +}; + #endif diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index d50f42a730aa36c63b408be1174d8f553880467b..45b9d40773b13a4255c3d6114240fd7e2a469eef 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -488,9 +488,13 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_IOAS] = { .destroy = iommufd_ioas_destroy, }, - [IOMMUFD_OBJ_HW_PAGETABLE] = { - .destroy = iommufd_hw_pagetable_destroy, - .abort = iommufd_hw_pagetable_abort, + [IOMMUFD_OBJ_HWPT_PAGING] = { + .destroy = iommufd_hwpt_paging_destroy, + .abort = iommufd_hwpt_paging_abort, + }, + [IOMMUFD_OBJ_HWPT_NESTED] = { + .destroy = iommufd_hwpt_nested_destroy, + .abort = iommufd_hwpt_nested_abort, }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 8ea74157495d3f4ee387d652a4b75bd692d9fddd..67a7591390b94ee521b35343b85e6c67b3102010 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -20,6 +20,8 @@ static DECLARE_FAULT_ATTR(fail_iommufd); static struct dentry *dbgfs_root; static struct platform_device *selftest_iommu_dev; +static const struct iommu_ops mock_ops; +static struct iommu_domain_ops domain_nested_ops; size_t iommufd_test_memory_limit = 65536; @@ -108,6 +110,12 @@ struct mock_iommu_domain { struct xarray pfns; }; +struct mock_iommu_domain_nested { + struct iommu_domain domain; + struct mock_iommu_domain *parent; + u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; +}; + enum selftest_obj_type { TYPE_IDEV, }; @@ -226,8 +234,6 @@ const struct iommu_dirty_ops dirty_ops = { .read_and_clear_dirty = mock_domain_read_and_clear_dirty, }; -static const struct iommu_ops mock_ops; - static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) { struct mock_iommu_domain *mock; @@ -245,25 +251,69 @@ static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) } static struct iommu_domain * -mock_domain_alloc_user(struct device *dev, u32 flags) +__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, + const struct iommu_hwpt_selftest *user_cfg) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); - struct iommu_domain *domain; + struct mock_iommu_domain_nested *mock_nested; + int i; - if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) - return ERR_PTR(-EOPNOTSUPP); + mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL); + if (!mock_nested) + return ERR_PTR(-ENOMEM); + mock_nested->parent = mock_parent; 
+ mock_nested->domain.ops = &domain_nested_ops; + mock_nested->domain.type = IOMMU_DOMAIN_NESTED; + for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) + mock_nested->iotlb[i] = user_cfg->iotlb; + return &mock_nested->domain; +} + +static struct iommu_domain * +mock_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct mock_iommu_domain *mock_parent; + struct iommu_hwpt_selftest user_cfg; + int rc; + + /* must be mock_domain */ + if (!parent) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_domain *domain; + + if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + return ERR_PTR(-EOPNOTSUPP); + if (user_data || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + domain = mock_domain_alloc_paging(NULL); + if (!domain) + return ERR_PTR(-ENOMEM); + if (has_dirty_flag) + container_of(domain, struct mock_iommu_domain, domain) + ->domain.dirty_ops = &dirty_ops; + return domain; + } - if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && - (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) + /* must be mock_domain_nested */ + if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags) return ERR_PTR(-EOPNOTSUPP); + if (!parent || parent->ops != mock_ops.default_domain_ops) + return ERR_PTR(-EINVAL); - domain = mock_domain_alloc_paging(NULL); - if (domain && !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) - domain->dirty_ops = &dirty_ops; - if (!domain) - domain = ERR_PTR(-ENOMEM); - return domain; + mock_parent = container_of(parent, struct mock_iommu_domain, domain); + if (!mock_parent) + return ERR_PTR(-EINVAL); + + rc = iommu_copy_struct_from_user(&user_cfg, user_data, + IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); + + return __mock_domain_alloc_nested(mock_parent, &user_cfg); } static void mock_domain_free(struct iommu_domain *domain) @@ -430,19 +480,41 @@ static const struct iommu_ops mock_ops = { }, }; +static void mock_domain_free_nested(struct iommu_domain *domain) +{ + struct mock_iommu_domain_nested *mock_nested = + container_of(domain, struct mock_iommu_domain_nested, domain); + + kfree(mock_nested); +} + +static struct iommu_domain_ops domain_nested_ops = { + .free = mock_domain_free_nested, + .attach_dev = mock_domain_nop_attach, +}; + static inline struct iommufd_hw_pagetable * -get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, - struct mock_iommu_domain **mock) +__get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, u32 hwpt_type) { - struct iommufd_hw_pagetable *hwpt; struct iommufd_object *obj; - obj = iommufd_get_object(ucmd->ictx, mockpt_id, - IOMMUFD_OBJ_HW_PAGETABLE); + obj = iommufd_get_object(ucmd->ictx, mockpt_id, hwpt_type); if (IS_ERR(obj)) return ERR_CAST(obj); - hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); - if (hwpt->domain->ops != mock_ops.default_domain_ops) { + return container_of(obj, struct iommufd_hw_pagetable, obj); +} + +static inline struct iommufd_hw_pagetable * +get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain **mock) +{ + struct iommufd_hw_pagetable *hwpt; + + hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_PAGING); + if (IS_ERR(hwpt)) + return hwpt; + if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || + hwpt->domain->ops != mock_ops.default_domain_ops) { iommufd_put_object(&hwpt->obj); 
return ERR_PTR(-EINVAL); } @@ -450,6 +522,25 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; } +static inline struct iommufd_hw_pagetable * +get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain_nested **mock_nested) +{ + struct iommufd_hw_pagetable *hwpt; + + hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_NESTED); + if (IS_ERR(hwpt)) + return hwpt; + if (hwpt->domain->type != IOMMU_DOMAIN_NESTED || + hwpt->domain->ops != &domain_nested_ops) { + iommufd_put_object(&hwpt->obj); + return ERR_PTR(-EINVAL); + } + *mock_nested = container_of(hwpt->domain, + struct mock_iommu_domain_nested, domain); + return hwpt; +} + struct mock_bus_type { struct bus_type bus; struct notifier_block nb; diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index 6c810bf80f99a71305f9c371441e1a33c8f25925..538fbf76354d13d5b7f6478a82dd40e2daf67add 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -255,7 +255,7 @@ static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) { - struct iommufd_hw_pagetable *hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = 1; @@ -264,8 +264,8 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) return PTR_ERR(ioas); mutex_lock(&ioas->mutex); - list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { - if (!hwpt->enforce_cache_coherency) { + list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { + if (!hwpt_paging->enforce_cache_coherency) { rc = 0; break; } diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 10b964600948c7f618c9fc6f2f6911b81ef15408..d30e453d0fb4b74a7eb47c77a86b3f11773c0742 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -11,6 +11,7 @@ #include #include #include +#include /* The anchor node sits above the top of the usable address space */ #define IOVA_ANCHOR ~0UL @@ -622,15 +623,21 @@ EXPORT_SYMBOL_GPL(reserve_iova); /* * As kmalloc's buffer size is fixed to power of 2, 127 is chosen to * assure size of 'iova_magazine' to be 1024 bytes, so that no memory - * will be wasted. + * will be wasted. Since only full magazines are inserted into the depot, + * we don't need to waste PFN capacity on a separate list head either. 
*/ #define IOVA_MAG_SIZE 127 -#define MAX_GLOBAL_MAGS 32 /* magazines per bin */ + +#define IOVA_DEPOT_DELAY msecs_to_jiffies(100) struct iova_magazine { - unsigned long size; + union { + unsigned long size; + struct iova_magazine *next; + }; unsigned long pfns[IOVA_MAG_SIZE]; }; +static_assert(!(sizeof(struct iova_magazine) & (sizeof(struct iova_magazine) - 1))); struct iova_cpu_rcache { spinlock_t lock; @@ -640,9 +647,11 @@ struct iova_cpu_rcache { struct iova_rcache { spinlock_t lock; - unsigned long depot_size; - struct iova_magazine *depot[MAX_GLOBAL_MAGS]; + unsigned int depot_size; + struct iova_magazine *depot; struct iova_cpu_rcache __percpu *cpu_rcaches; + struct iova_domain *iovad; + struct delayed_work work; }; static struct iova_magazine *iova_magazine_alloc(gfp_t flags) @@ -717,6 +726,41 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn) mag->pfns[mag->size++] = pfn; } +static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache) +{ + struct iova_magazine *mag = rcache->depot; + + rcache->depot = mag->next; + mag->size = IOVA_MAG_SIZE; + rcache->depot_size--; + return mag; +} + +static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag) +{ + mag->next = rcache->depot; + rcache->depot = mag; + rcache->depot_size++; +} + +static void iova_depot_work_func(struct work_struct *work) +{ + struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work); + struct iova_magazine *mag = NULL; + unsigned long flags; + + spin_lock_irqsave(&rcache->lock, flags); + if (rcache->depot_size > num_online_cpus()) + mag = iova_depot_pop(rcache); + spin_unlock_irqrestore(&rcache->lock, flags); + + if (mag) { + iova_magazine_free_pfns(mag, rcache->iovad); + iova_magazine_free(mag); + schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY); + } +} + int iova_domain_init_rcaches(struct iova_domain *iovad) { unsigned int cpu; @@ -734,7 +778,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad) rcache = &iovad->rcaches[i]; spin_lock_init(&rcache->lock); - rcache->depot_size = 0; + rcache->iovad = iovad; + INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func); rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size()); if (!rcache->cpu_rcaches) { @@ -776,7 +821,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, struct iova_rcache *rcache, unsigned long iova_pfn) { - struct iova_magazine *mag_to_free = NULL; struct iova_cpu_rcache *cpu_rcache; bool can_insert = false; unsigned long flags; @@ -794,13 +838,9 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, if (new_mag) { spin_lock(&rcache->lock); - if (rcache->depot_size < MAX_GLOBAL_MAGS) { - rcache->depot[rcache->depot_size++] = - cpu_rcache->loaded; - } else { - mag_to_free = cpu_rcache->loaded; - } + iova_depot_push(rcache, cpu_rcache->loaded); spin_unlock(&rcache->lock); + schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY); cpu_rcache->loaded = new_mag; can_insert = true; @@ -812,11 +852,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, spin_unlock_irqrestore(&cpu_rcache->lock, flags); - if (mag_to_free) { - iova_magazine_free_pfns(mag_to_free, iovad); - iova_magazine_free(mag_to_free); - } - return can_insert; } @@ -854,9 +889,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache, has_pfn = true; } else { spin_lock(&rcache->lock); - if (rcache->depot_size > 0) { + if (rcache->depot) { iova_magazine_free(cpu_rcache->loaded); - cpu_rcache->loaded = rcache->depot[--rcache->depot_size]; + 
cpu_rcache->loaded = iova_depot_pop(rcache); has_pfn = true; } spin_unlock(&rcache->lock); @@ -895,9 +930,8 @@ static void free_iova_rcaches(struct iova_domain *iovad) struct iova_rcache *rcache; struct iova_cpu_rcache *cpu_rcache; unsigned int cpu; - int i, j; - for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { + for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { rcache = &iovad->rcaches[i]; if (!rcache->cpu_rcaches) break; @@ -907,8 +941,9 @@ static void free_iova_rcaches(struct iova_domain *iovad) iova_magazine_free(cpu_rcache->prev); } free_percpu(rcache->cpu_rcaches); - for (j = 0; j < rcache->depot_size; ++j) - iova_magazine_free(rcache->depot[j]); + cancel_delayed_work_sync(&rcache->work); + while (rcache->depot) + iova_magazine_free(iova_depot_pop(rcache)); } kfree(iovad->rcaches); @@ -942,16 +977,16 @@ static void free_global_cached_iovas(struct iova_domain *iovad) { struct iova_rcache *rcache; unsigned long flags; - int i, j; - for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { + for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { rcache = &iovad->rcaches[i]; spin_lock_irqsave(&rcache->lock, flags); - for (j = 0; j < rcache->depot_size; ++j) { - iova_magazine_free_pfns(rcache->depot[j], iovad); - iova_magazine_free(rcache->depot[j]); + while (rcache->depot) { + struct iova_magazine *mag = iova_depot_pop(rcache); + + iova_magazine_free_pfns(mag, iovad); + iova_magazine_free(mag); } - rcache->depot_size = 0; spin_unlock_irqrestore(&rcache->lock, flags); } } diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 929a80044b5d431add468b5c32f5bd25ad385e3b..cd7219319c8b0f2cbae0bca0958989e48cf5c3b4 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -64,7 +64,6 @@ struct ipmmu_vmsa_device { struct ipmmu_vmsa_domain *domains[IPMMU_CTX_MAX]; s8 utlb_ctx[IPMMU_UTLB_MAX]; - struct iommu_group *group; struct dma_iommu_mapping *mapping; }; @@ -872,29 +871,18 @@ static void ipmmu_release_device(struct device *dev) arm_iommu_release_mapping(mmu->mapping); } -static struct iommu_group *ipmmu_find_group(struct device *dev) -{ - struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); - struct iommu_group *group; - - if (mmu->group) - return iommu_group_ref_get(mmu->group); - - group = iommu_group_alloc(); - if (!IS_ERR(group)) - mmu->group = group; - - return group; -} - static const struct iommu_ops ipmmu_ops = { .identity_domain = &ipmmu_iommu_identity_domain, .domain_alloc_paging = ipmmu_domain_alloc_paging, .probe_device = ipmmu_probe_device, .release_device = ipmmu_release_device, .probe_finalize = ipmmu_probe_finalize, + /* + * FIXME: The device grouping is a fixed property of the hardware's + * ability to isolate and control DMA, it should not depend on kconfig. + */ .device_group = IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA) - ? generic_device_group : ipmmu_find_group, + ? 
generic_device_group : generic_single_device_group, .pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K, .of_xlate = ipmmu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index f768dc9ada43fc742a8a348e20d082f0ba636e0d..989e0869d80554ba9b462fac95bcf39b600a1b7a 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -498,12 +498,13 @@ static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova, return ret; } -static void msm_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, - size_t size) +static int msm_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, + size_t size) { struct msm_priv *priv = to_msm_priv(domain); __flush_iotlb_range(iova, size, SZ_4K, false, priv); + return 0; } static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova, diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index b22a2f39c70194b2fe41dc127ae2296ae286a58d..add2eaaaa7ee18d79d8740292c72e5540f755db9 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -836,12 +836,13 @@ static void mtk_iommu_iotlb_sync(struct iommu_domain *domain, mtk_iommu_tlb_flush_range_sync(gather->start, length, dom->bank); } -static void mtk_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, - size_t size) +static int mtk_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, + size_t size) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); mtk_iommu_tlb_flush_range_sync(iova, size, dom->bank); + return 0; } static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain, @@ -862,16 +863,11 @@ static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain, static struct iommu_device *mtk_iommu_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct mtk_iommu_data *data; + struct mtk_iommu_data *data = dev_iommu_priv_get(dev); struct device_link *link; struct device *larbdev; unsigned int larbid, larbidx, i; - if (!fwspec || fwspec->ops != &mtk_iommu_ops) - return ERR_PTR(-ENODEV); /* Not a iommu client device */ - - data = dev_iommu_priv_get(dev); - if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) return &data->iommu; diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 8711b00b6c99b412792d2eaa7b6e0edf2a265cee..0ddcd153b568da72c6ad5e79119f20d2c782e3e1 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -482,9 +482,6 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) idx++; } - if (!fwspec || fwspec->ops != &mtk_iommu_v1_ops) - return ERR_PTR(-ENODEV); /* Not a iommu client device */ - data = dev_iommu_priv_get(dev); /* Link the consumer device with the smi-larb device(supplier) */ diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 50b567e420282d5bf9545a79fa4ffde8ed561faa..c9528065a59afac738a6f06ba89ef11c90082a72 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1225,18 +1225,15 @@ static int omap_iommu_probe(struct platform_device *pdev) platform_set_drvdata(pdev, obj); if (omap_iommu_can_register(pdev)) { - obj->group = iommu_group_alloc(); - if (IS_ERR(obj->group)) - return PTR_ERR(obj->group); - err = iommu_device_sysfs_add(&obj->iommu, obj->dev, NULL, obj->name); if (err) - goto out_group; + return err; err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev); if (err) goto out_sysfs; + obj->has_iommu_driver = true; } 
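/*
 * Illustrative sketch (not part of this patch): iotlb_sync_map now returns
 * int so drivers can propagate flush failures; a driver whose flush cannot
 * fail simply returns 0 after issuing it, as the msm and mtk conversions
 * above do. A hypothetical minimal implementation:
 */
static int example_iotlb_sync_map(struct iommu_domain *domain,
				  unsigned long iova, size_t size)
{
	/* Flush the hardware IOTLB for [iova, iova + size) here. */
	return 0;
}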
pm_runtime_enable(obj->dev); @@ -1252,8 +1249,6 @@ static int omap_iommu_probe(struct platform_device *pdev) out_sysfs: iommu_device_sysfs_remove(&obj->iommu); -out_group: - iommu_group_put(obj->group); return err; } @@ -1261,10 +1256,7 @@ static void omap_iommu_remove(struct platform_device *pdev) { struct omap_iommu *obj = platform_get_drvdata(pdev); - if (obj->group) { - iommu_group_put(obj->group); - obj->group = NULL; - + if (obj->has_iommu_driver) { iommu_device_sysfs_remove(&obj->iommu); iommu_device_unregister(&obj->iommu); } @@ -1318,7 +1310,8 @@ static u32 iotlb_init_entry(struct iotlb_entry *e, u32 da, u32 pa, int pgsz) } static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, - phys_addr_t pa, size_t bytes, int prot, gfp_t gfp) + phys_addr_t pa, size_t bytes, size_t count, + int prot, gfp_t gfp, size_t *mapped) { struct omap_iommu_domain *omap_domain = to_omap_domain(domain); struct device *dev = omap_domain->dev; @@ -1356,13 +1349,15 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, oiommu = iommu->iommu_dev; iopgtable_clear_entry(oiommu, da); } + } else { + *mapped = bytes; } return ret; } static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da, - size_t size, struct iommu_iotlb_gather *gather) + size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct omap_iommu_domain *omap_domain = to_omap_domain(domain); struct device *dev = omap_domain->dev; @@ -1728,31 +1723,17 @@ static void omap_iommu_release_device(struct device *dev) } -static struct iommu_group *omap_iommu_device_group(struct device *dev) -{ - struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev); - struct iommu_group *group = ERR_PTR(-EINVAL); - - if (!arch_data) - return ERR_PTR(-ENODEV); - - if (arch_data->iommu_dev) - group = iommu_group_ref_get(arch_data->iommu_dev->group); - - return group; -} - static const struct iommu_ops omap_iommu_ops = { .identity_domain = &omap_iommu_identity_domain, .domain_alloc_paging = omap_iommu_domain_alloc_paging, .probe_device = omap_iommu_probe_device, .release_device = omap_iommu_release_device, - .device_group = omap_iommu_device_group, + .device_group = generic_single_device_group, .pgsize_bitmap = OMAP_IOMMU_PGSIZES, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = omap_iommu_attach_dev, - .map = omap_iommu_map, - .unmap = omap_iommu_unmap, + .map_pages = omap_iommu_map, + .unmap_pages = omap_iommu_unmap, .iova_to_phys = omap_iommu_iova_to_phys, .free = omap_iommu_domain_free, } diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h index 18ee713ede784d16c884b9cca19ed8ce240706d4..27697109ec79a55953dd933302a27a76210a9858 100644 --- a/drivers/iommu/omap-iommu.h +++ b/drivers/iommu/omap-iommu.h @@ -80,7 +80,7 @@ struct omap_iommu { u32 id; struct iommu_device iommu; - struct iommu_group *group; + bool has_iommu_driver; u8 pwrst; }; diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h index 98daf0e1a3069089470360ce46938808d7ec14fb..3a67d4d664b06243e6acd257e01ab5f295b2bbde 100644 --- a/drivers/iommu/riscv/iommu-bits.h +++ b/drivers/iommu/riscv/iommu-bits.h @@ -706,79 +706,4 @@ struct riscv_iommu_msipte { #define RISCV_IOMMU_MSIPTE_MRIF_NPPN RISCV_IOMMU_PPN_FIELD #define RISCV_IOMMU_MSIPTE_MRIF_NID_MSB BIT_ULL(60) -/* Helper functions: command structure builders. 
*/ - -static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd) -{ - cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | - FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA); - cmd->dword1 = 0; -} - -static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd, - u64 addr) -{ - cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr)); - cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV; -} - -static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd, - int pscid) -{ - cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) | - RISCV_IOMMU_CMD_IOTINVAL_PSCV; -} - -static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd, - int gscid) -{ - cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) | - RISCV_IOMMU_CMD_IOTINVAL_GV; -} - -static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd) -{ - cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) | - FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) | - RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW; - cmd->dword1 = 0; -} - -static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd, - u64 addr, u32 data) -{ - cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) | - FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) | - FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) | - RISCV_IOMMU_CMD_IOFENCE_AV; - cmd->dword1 = addr >> 2; -} - -static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd) -{ - cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) | - FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT); - cmd->dword1 = 0; -} - -static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd) -{ - cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) | - FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT); - cmd->dword1 = 0; -} - -static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd, - unsigned int devid) -{ - cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) | - RISCV_IOMMU_CMD_IODIR_DV; -} - -static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd, - unsigned int pasid) -{ - cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid); -} - #endif /* _RISCV_IOMMU_BITS_H_ */ diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index 8a05def774bdb5666091ab8d469bf1783d3da6f8..c54088bf138f46f728621b5fd5fe2695211af4f1 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -16,630 +16,14 @@ #include #include #include -#include #include #include -#include "../iommu-pages.h" #include "iommu-bits.h" #include "iommu.h" /* Timeouts in [us] */ -#define RISCV_IOMMU_QCSR_TIMEOUT 150000 -#define RISCV_IOMMU_QUEUE_TIMEOUT 150000 -#define RISCV_IOMMU_DDTP_TIMEOUT 10000000 -#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000 - -/* Number of entries per CMD/FLT queue, should be <= INT_MAX */ -#define RISCV_IOMMU_DEF_CQ_COUNT 8192 -#define RISCV_IOMMU_DEF_FQ_COUNT 4096 - -/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ -#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) -#define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12)) - -#define dev_to_iommu(dev) \ - iommu_get_iommu_dev(dev, 
struct riscv_iommu_device, iommu) - -/* IOMMU PSCID allocation namespace. */ -static DEFINE_IDA(riscv_iommu_pscids); -#define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1) - -/* Device resource-managed allocations */ -struct riscv_iommu_devres { - void *addr; - int order; -}; - -static void riscv_iommu_devres_pages_release(struct device *dev, void *res) -{ - struct riscv_iommu_devres *devres = res; - - iommu_free_pages(devres->addr, devres->order); -} - -static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p) -{ - struct riscv_iommu_devres *devres = res; - struct riscv_iommu_devres *target = p; - - return devres->addr == target->addr; -} - -static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order) -{ - struct riscv_iommu_devres *devres; - void *addr; - - addr = iommu_alloc_pages_node(dev_to_node(iommu->dev), - GFP_KERNEL_ACCOUNT, order); - if (unlikely(!addr)) - return NULL; - - devres = devres_alloc(riscv_iommu_devres_pages_release, - sizeof(struct riscv_iommu_devres), GFP_KERNEL); - - if (unlikely(!devres)) { - iommu_free_pages(addr, order); - return NULL; - } - - devres->addr = addr; - devres->order = order; - - devres_add(iommu->dev, devres); - - return addr; -} - -static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr) -{ - struct riscv_iommu_devres devres = { .addr = addr }; - - devres_release(iommu->dev, riscv_iommu_devres_pages_release, - riscv_iommu_devres_pages_match, &devres); -} - -/* - * Hardware queue allocation and management. - */ - -/* Setup queue base, control registers and default queue length */ -#define RISCV_IOMMU_QUEUE_INIT(q, name) do { \ - struct riscv_iommu_queue *_q = q; \ - _q->qid = RISCV_IOMMU_INTR_ ## name; \ - _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \ - _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \ - _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\ -} while (0) - -/* Note: offsets are the same for all queues */ -#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB)) -#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB)) -#define Q_ITEM(q, index) ((q)->mask & (index)) -#define Q_IPSR(q) BIT((q)->qid) - -/* - * Discover queue ring buffer hardware configuration, allocate in-memory - * ring buffer or use fixed I/O memory location, configure queue base register. - * Must be called before hardware queue is enabled. - * - * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT() - * @entry_size - queue single element size in bytes. - */ -static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu, - struct riscv_iommu_queue *queue, - size_t entry_size) -{ - unsigned int logsz; - u64 qb, rb; - - /* - * Use WARL base register property to discover maximum allowed - * number of entries and optional fixed IO address for queue location. - */ - riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD); - qb = riscv_iommu_readq(iommu, queue->qbr); - - /* - * Calculate and verify hardware supported queue length, as reported - * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1). - * Update queue size based on hardware supported value. - */ - logsz = ilog2(queue->mask); - if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb)) - logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb); - - /* - * Use WARL base register property to discover an optional fixed IO - * address for queue ring buffer location. Otherwise allocate contiguous - * system memory. 
- */ - if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) { - const size_t queue_size = entry_size << (logsz + 1); - - queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); - queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size); - } else { - do { - const size_t queue_size = entry_size << (logsz + 1); - const int order = get_order(queue_size); - - queue->base = riscv_iommu_get_pages(iommu, order); - queue->phys = __pa(queue->base); - } while (!queue->base && logsz-- > 0); - } - - if (!queue->base) - return -ENOMEM; - - qb = phys_to_ppn(queue->phys) | - FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz); - - /* Update base register and read back to verify hw accepted our write */ - riscv_iommu_writeq(iommu, queue->qbr, qb); - rb = riscv_iommu_readq(iommu, queue->qbr); - if (rb != qb) { - dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid); - return -ENODEV; - } - - /* Update actual queue mask */ - queue->mask = (2U << logsz) - 1; - - dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries", - queue->qid, logsz + 1); - - return 0; -} - -/* Check interrupt queue status, IPSR */ -static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data) -{ - struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; - - if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue)) - return IRQ_WAKE_THREAD; - - return IRQ_NONE; -} - -static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n) -{ - /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */ - return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV; -} - -/* - * Enable queue processing in the hardware, register interrupt handler. - * - * @queue - data structure, already allocated with riscv_iommu_queue_alloc() - * @irq_handler - threaded interrupt handler. - */ -static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu, - struct riscv_iommu_queue *queue, - irq_handler_t irq_handler) -{ - const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)]; - u32 csr; - int rc; - - if (queue->iommu) - return -EBUSY; - - /* Polling not implemented */ - if (!irq) - return -ENODEV; - - queue->iommu = iommu; - rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler, - IRQF_ONESHOT | IRQF_SHARED, - dev_name(iommu->dev), queue); - if (rc) { - queue->iommu = NULL; - return rc; - } - - /* - * Enable queue with interrupts, clear any memory fault if any. - * Wait for the hardware to acknowledge request and activate queue - * processing. - * Note: All CSR bitfields are in the same offsets for all queues. - */ - riscv_iommu_writel(iommu, queue->qcr, - RISCV_IOMMU_QUEUE_ENABLE | - RISCV_IOMMU_QUEUE_INTR_ENABLE | - RISCV_IOMMU_QUEUE_MEM_FAULT); - - riscv_iommu_readl_timeout(iommu, queue->qcr, - csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), - 10, RISCV_IOMMU_QCSR_TIMEOUT); - - if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE | - RISCV_IOMMU_QUEUE_BUSY | - RISCV_IOMMU_QUEUE_MEM_FAULT))) { - /* Best effort to stop and disable failing hardware queue. */ - riscv_iommu_writel(iommu, queue->qcr, 0); - free_irq(irq, queue); - queue->iommu = NULL; - dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid); - return -EBUSY; - } - - /* Clear any pending interrupt flag. */ - riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); - - return 0; -} - -/* - * Disable queue. Wait for the hardware to acknowledge request and - * stop processing enqueued requests. Report errors but continue. 
- */ -static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue) -{ - struct riscv_iommu_device *iommu = queue->iommu; - u32 csr; - - if (!iommu) - return; - - free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue); - riscv_iommu_writel(iommu, queue->qcr, 0); - riscv_iommu_readl_timeout(iommu, queue->qcr, - csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), - 10, RISCV_IOMMU_QCSR_TIMEOUT); - - if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY)) - dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n", - queue->qid, csr); - - queue->iommu = NULL; -} - -/* - * Returns number of available valid queue entries and the first item index. - * Update shadow producer index if necessary. - */ -static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue, - unsigned int *index) -{ - unsigned int head = atomic_read(&queue->head); - unsigned int tail = atomic_read(&queue->tail); - unsigned int last = Q_ITEM(queue, tail); - int available = (int)(tail - head); - - *index = head; - - if (available > 0) - return available; - - /* read hardware producer index, check reserved register bits are not set. */ - if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue), - tail, (tail & ~queue->mask) == 0, - 0, RISCV_IOMMU_QUEUE_TIMEOUT)) { - dev_err_once(queue->iommu->dev, - "Hardware error: queue access timeout\n"); - return 0; - } - - if (tail == last) - return 0; - - /* update shadow producer index */ - return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head); -} - -/* - * Release processed queue entries, should match riscv_iommu_queue_consume() calls. - */ -static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count) -{ - const unsigned int head = atomic_add_return(count, &queue->head); - - riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head)); -} - -/* Return actual consumer index based on hardware reported queue head index. */ -static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue) -{ - const unsigned int cons = atomic_read(&queue->head); - const unsigned int last = Q_ITEM(queue, cons); - unsigned int head; - - if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, - !(head & ~queue->mask), - 0, RISCV_IOMMU_QUEUE_TIMEOUT)) - return cons; - - return cons + ((head - last) & queue->mask); -} - -/* Wait for submitted item to be processed. */ -static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue, - unsigned int index, - unsigned int timeout_us) -{ - unsigned int cons = atomic_read(&queue->head); - - /* Already processed by the consumer */ - if ((int)(cons - index) > 0) - return 0; - - /* Monitor consumer index */ - return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons, - (int)(cons - index) > 0, 0, timeout_us); -} - -/* Enqueue an entry and wait to be processed if timeout_us > 0 - * - * Error handling for IOMMU hardware not responding in reasonable time - * will be added as separate patch series along with other RAS features. - * For now, only report hardware failure and continue. - */ -static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue, - void *entry, size_t entry_size) -{ - unsigned int prod; - unsigned int head; - unsigned int tail; - unsigned long flags; - - /* Do not preempt submission flow. */ - local_irq_save(flags); - - /* 1. Allocate some space in the queue */ - prod = atomic_inc_return(&queue->prod) - 1; - head = atomic_read(&queue->head); - - /* 2. Wait for space availability. 
*/ - if ((prod - head) > queue->mask) { - if (readx_poll_timeout(atomic_read, &queue->head, - head, (prod - head) < queue->mask, - 0, RISCV_IOMMU_QUEUE_TIMEOUT)) - goto err_busy; - } else if ((prod - head) == queue->mask) { - const unsigned int last = Q_ITEM(queue, head); - - if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, - !(head & ~queue->mask) && head != last, - 0, RISCV_IOMMU_QUEUE_TIMEOUT)) - goto err_busy; - atomic_add((head - last) & queue->mask, &queue->head); - } - - /* 3. Store entry in the ring buffer */ - memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size); - - /* 4. Wait for all previous entries to be ready */ - if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail, - 0, RISCV_IOMMU_QUEUE_TIMEOUT)) - goto err_busy; - - /* - * 5. Make sure the ring buffer update (whether in normal or I/O memory) is - * completed and visible before signaling the tail doorbell to fetch - * the next command. 'fence ow, ow' - */ - dma_wmb(); - riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1)); - - /* - * 6. Make sure the doorbell write to the device has finished before updating - * the shadow tail index in normal memory. 'fence o, w' - */ - mmiowb(); - atomic_inc(&queue->tail); - - /* 7. Complete submission and restore local interrupts */ - local_irq_restore(flags); - - return prod; - -err_busy: - local_irq_restore(flags); - dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n"); - - return prod; -} - -/* - * IOMMU Command queue chapter 3.1 - */ - -/* Command queue interrupt handler thread function */ -static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data) -{ - const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; - unsigned int ctrl; - - /* Clear MF/CQ errors, complete error recovery to be implemented. */ - ctrl = riscv_iommu_readl(queue->iommu, queue->qcr); - if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO | - RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) { - riscv_iommu_writel(queue->iommu, queue->qcr, ctrl); - dev_warn(queue->iommu->dev, - "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n", - queue->qid, - !!(ctrl & RISCV_IOMMU_CQCSR_CQMF), - !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO), - !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL), - !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP)); - } - - /* Placeholder for command queue interrupt notifiers */ - - /* Clear command interrupt pending. */ - riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); - - return IRQ_HANDLED; -} - -/* Send command to the IOMMU command queue */ -static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, - struct riscv_iommu_command *cmd) -{ - riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd)); -} - -/* Send IOFENCE.C command and wait for all scheduled commands to complete. 
*/ -static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, - unsigned int timeout_us) -{ - struct riscv_iommu_command cmd; - unsigned int prod; - - riscv_iommu_cmd_iofence(&cmd); - prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd)); - - if (!timeout_us) - return; - - if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us)) - dev_err_once(iommu->dev, - "Hardware error: command execution timeout\n"); -} - -/* - * IOMMU Fault/Event queue chapter 3.2 - */ - -static void riscv_iommu_fault(struct riscv_iommu_device *iommu, - struct riscv_iommu_fq_record *event) -{ - unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr); - unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr); - - /* Placeholder for future fault handling implementation, report only. */ - if (err) - dev_warn_ratelimited(iommu->dev, - "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n", - err, devid, event->iotval, event->iotval2); -} - -/* Fault queue interrupt handler thread function */ -static irqreturn_t riscv_iommu_fltq_process(int irq, void *data) -{ - struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; - struct riscv_iommu_device *iommu = queue->iommu; - struct riscv_iommu_fq_record *events; - unsigned int ctrl, idx; - int cnt, len; - - events = (struct riscv_iommu_fq_record *)queue->base; - - /* Clear fault interrupt pending and process all received fault events. */ - riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); - - do { - cnt = riscv_iommu_queue_consume(queue, &idx); - for (len = 0; len < cnt; idx++, len++) - riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]); - riscv_iommu_queue_release(queue, cnt); - } while (cnt > 0); - - /* Clear MF/OF errors, complete error recovery to be implemented. */ - ctrl = riscv_iommu_readl(iommu, queue->qcr); - if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) { - riscv_iommu_writel(iommu, queue->qcr, ctrl); - dev_warn(iommu->dev, - "Queue #%u error; memory fault:%d overflow:%d\n", - queue->qid, - !!(ctrl & RISCV_IOMMU_FQCSR_FQMF), - !!(ctrl & RISCV_IOMMU_FQCSR_FQOF)); - } - - return IRQ_HANDLED; -} - -/* Lookup and initialize device context info structure. 
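
The fault-queue handler removed above drains events in batches: consume reports how many contiguous entries are ready, each record is handled, and release advances the shadow head (and, in the driver, the hardware head doorbell). A simplified single-threaded sketch of that consume/process/release pattern, with a made-up event type:

  #include <stdio.h>

  #define QLEN  8
  #define QMASK (QLEN - 1)

  struct event { int cause; };

  static struct event ring[QLEN];
  static unsigned int head, tail;     /* unbounded indices, masked on use */

  static int consume(unsigned int *first)
  {
      *first = head;
      return (int)(tail - head);      /* number of ready entries */
  }

  static void release(int count)
  {
      head += count;                  /* hardware head doorbell omitted */
  }

  int main(void)
  {
      for (int i = 0; i < 3; i++)
          ring[(tail++) & QMASK].cause = 0x100 + i;

      unsigned int idx;
      int cnt;

      while ((cnt = consume(&idx)) > 0) {
          for (int n = 0; n < cnt; n++, idx++)
              printf("event %d: cause 0x%x\n", n, ring[idx & QMASK].cause);
          release(cnt);
      }
      return 0;
  }
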
*/ -static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu, - unsigned int devid) -{ - const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT); - unsigned int depth; - unsigned long ddt, old, new; - void *ptr; - u8 ddi_bits[3] = { 0 }; - u64 *ddtp = NULL; - - /* Make sure the mode is valid */ - if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL || - iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL) - return NULL; - - /* - * Device id partitioning for base format: - * DDI[0]: bits 0 - 6 (1st level) (7 bits) - * DDI[1]: bits 7 - 15 (2nd level) (9 bits) - * DDI[2]: bits 16 - 23 (3rd level) (8 bits) - * - * For extended format: - * DDI[0]: bits 0 - 5 (1st level) (6 bits) - * DDI[1]: bits 6 - 14 (2nd level) (9 bits) - * DDI[2]: bits 15 - 23 (3rd level) (9 bits) - */ - if (base_format) { - ddi_bits[0] = 7; - ddi_bits[1] = 7 + 9; - ddi_bits[2] = 7 + 9 + 8; - } else { - ddi_bits[0] = 6; - ddi_bits[1] = 6 + 9; - ddi_bits[2] = 6 + 9 + 9; - } - - /* Make sure device id is within range */ - depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL; - if (devid >= (1 << ddi_bits[depth])) - return NULL; - - /* Get to the level of the non-leaf node that holds the device context */ - for (ddtp = iommu->ddt_root; depth-- > 0;) { - const int split = ddi_bits[depth]; - /* - * Each non-leaf node is 64bits wide and on each level - * nodes are indexed by DDI[depth]. - */ - ddtp += (devid >> split) & 0x1FF; - - /* - * Check if this node has been populated and if not - * allocate a new level and populate it. - */ - do { - ddt = READ_ONCE(*(unsigned long *)ddtp); - if (ddt & RISCV_IOMMU_DDTE_V) { - ddtp = __va(ppn_to_phys(ddt)); - break; - } - - ptr = riscv_iommu_get_pages(iommu, 0); - if (!ptr) - return NULL; - - new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V; - old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new); - - if (old == ddt) { - ddtp = (u64 *)ptr; - break; - } - - /* Race setting DDT detected, re-read and retry. */ - riscv_iommu_free_pages(iommu, ptr); - } while (1); - } - - /* - * Grab the node that matches DDI[depth], note that when using base - * format the device context is 4 * 64bits, and the extended format - * is 8 * 64bits, hence the (3 - base_format) below. - */ - ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format); - - return (struct riscv_iommu_dc *)ddtp; -} +#define RISCV_IOMMU_DDTP_TIMEOUT 50000 /* * This is best effort IOMMU translation shutdown flow. @@ -653,798 +37,10 @@ static void riscv_iommu_disable(struct riscv_iommu_device *iommu) riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0); } -#define riscv_iommu_read_ddtp(iommu) ({ \ - u64 ddtp; \ - riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \ - !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \ - RISCV_IOMMU_DDTP_TIMEOUT); \ - ddtp; }) - -static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu) -{ - u64 ddtp; - unsigned int mode; - - ddtp = riscv_iommu_read_ddtp(iommu); - if (ddtp & RISCV_IOMMU_DDTP_BUSY) - return -EBUSY; - - /* - * It is optional for the hardware to report a fixed address for device - * directory root page when DDT.MODE is OFF or BARE. 
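
riscv_iommu_get_dc(), dropped above, walks the device directory by slicing the device ID into DDI[0..2] according to the table format: 7/9/8 bits for the base format and 6/9/9 bits for the extended format. A stand-alone illustration of that split, using a hypothetical helper and an arbitrary 24-bit device ID:

  #include <stdbool.h>
  #include <stdio.h>

  /* Extract the three directory indices from a device ID. */
  static void ddi_split(unsigned int devid, bool base_format, unsigned int ddi[3])
  {
      unsigned int b0 = base_format ? 7 : 6;
      unsigned int b1 = b0 + 9;

      ddi[0] = devid & ((1u << b0) - 1);              /* 1st-level index */
      ddi[1] = (devid >> b0) & 0x1ff;                 /* 2nd-level index, 9 bits */
      ddi[2] = (devid >> b1) & (base_format ? 0xff : 0x1ff);
  }

  int main(void)
  {
      unsigned int ddi[3];

      ddi_split(0x12345, true, ddi);                  /* base format */
      printf("base:     DDI[2]=%u DDI[1]=%u DDI[0]=%u\n", ddi[2], ddi[1], ddi[0]);

      ddi_split(0x12345, false, ddi);                 /* extended format */
      printf("extended: DDI[2]=%u DDI[1]=%u DDI[0]=%u\n", ddi[2], ddi[1], ddi[0]);
      return 0;
  }
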
- */ - mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); - if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE || - mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) { - /* Use WARL to discover hardware fixed DDT PPN */ - riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, - FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode)); - ddtp = riscv_iommu_read_ddtp(iommu); - if (ddtp & RISCV_IOMMU_DDTP_BUSY) - return -EBUSY; - - iommu->ddt_phys = ppn_to_phys(ddtp); - if (iommu->ddt_phys) - iommu->ddt_root = devm_ioremap(iommu->dev, - iommu->ddt_phys, PAGE_SIZE); - if (iommu->ddt_root) - memset(iommu->ddt_root, 0, PAGE_SIZE); - } - - if (!iommu->ddt_root) { - iommu->ddt_root = riscv_iommu_get_pages(iommu, 0); - iommu->ddt_phys = __pa(iommu->ddt_root); - } - - if (!iommu->ddt_root) - return -ENOMEM; - - return 0; -} - -/* - * Discover supported DDT modes starting from requested value, - * configure DDTP register with accepted mode and root DDT address. - * Accepted iommu->ddt_mode is updated on success. - */ -static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, - unsigned int ddtp_mode) -{ - struct device *dev = iommu->dev; - u64 ddtp, rq_ddtp; - unsigned int mode, rq_mode = ddtp_mode; - struct riscv_iommu_command cmd; - - ddtp = riscv_iommu_read_ddtp(iommu); - if (ddtp & RISCV_IOMMU_DDTP_BUSY) - return -EBUSY; - - /* Disallow state transition from xLVL to xLVL. */ - mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); - if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && - mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF && - rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && - rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) - return -EINVAL; - - do { - rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode); - if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) - rq_ddtp |= phys_to_ppn(iommu->ddt_phys); - - riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp); - ddtp = riscv_iommu_read_ddtp(iommu); - if (ddtp & RISCV_IOMMU_DDTP_BUSY) { - dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n", - rq_mode, ddtp); - return -EBUSY; - } - - /* Verify IOMMU hardware accepts new DDTP config. */ - mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); - - if (rq_mode == mode) - break; - - /* Hardware mandatory DDTP mode has not been accepted. */ - if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) { - dev_err(dev, "DDTP update failed hw: %llx vs %llx\n", - ddtp, rq_ddtp); - return -EINVAL; - } - - /* - * Mode field is WARL, an IOMMU may support a subset of - * directory table levels in which case if we tried to set - * an unsupported number of levels we'll readback either - * a valid xLVL or off/bare. If we got off/bare, try again - * with a smaller xLVL. - */ - if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && - rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) { - dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode); - rq_mode--; - continue; - } - - /* - * We tried all supported modes and IOMMU hardware failed to - * accept new settings, something went very wrong since off/bare - * and at least one xLVL must be supported. 
- */ - dev_err(dev, "DDTP hw mode %u, failed to set %u\n", - mode, ddtp_mode); - return -EINVAL; - } while (1); - - iommu->ddt_mode = mode; - if (mode != ddtp_mode) - dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode); - - /* Invalidate device context cache */ - riscv_iommu_cmd_iodir_inval_ddt(&cmd); - riscv_iommu_cmd_send(iommu, &cmd); - - /* Invalidate address translation cache */ - riscv_iommu_cmd_inval_vma(&cmd); - riscv_iommu_cmd_send(iommu, &cmd); - - /* IOFENCE.C */ - riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); - - return 0; -} - -/* This struct contains protection domain specific IOMMU driver data. */ -struct riscv_iommu_domain { - struct iommu_domain domain; - struct list_head bonds; - spinlock_t lock; /* protect bonds list updates. */ - int pscid; - bool amo_enabled; - int numa_node; - unsigned int pgd_mode; - unsigned long *pgd_root; -}; - -#define iommu_domain_to_riscv(iommu_domain) \ - container_of(iommu_domain, struct riscv_iommu_domain, domain) - -/* Private IOMMU data for managed devices, dev_iommu_priv_* */ -struct riscv_iommu_info { - struct riscv_iommu_domain *domain; -}; - -/* - * Linkage between an iommu_domain and attached devices. - * - * Protection domain requiring IOATC and DevATC translation cache invalidations, - * should be linked to attached devices using a riscv_iommu_bond structure. - * Devices should be linked to the domain before first use and unlinked after - * the translations from the referenced protection domain can no longer be used. - * Blocking and identity domains are not tracked here, as the IOMMU hardware - * does not cache negative and/or identity (BARE mode) translations, and DevATC - * is disabled for those protection domains. - * - * The device pointer and IOMMU data remain stable in the bond struct after - * _probe_device() where it's attached to the managed IOMMU, up to the - * completion of the _release_device() call. The release of the bond structure - * is synchronized with the device release. - */ -struct riscv_iommu_bond { - struct list_head list; - struct rcu_head rcu; - struct device *dev; -}; - -static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain, - struct device *dev) -{ - struct riscv_iommu_device *iommu = dev_to_iommu(dev); - struct riscv_iommu_bond *bond; - struct list_head *bonds; - - bond = kzalloc(sizeof(*bond), GFP_KERNEL); - if (!bond) - return -ENOMEM; - bond->dev = dev; - - /* - * List of devices attached to the domain is arranged based on - * managed IOMMU device. - */ - - spin_lock(&domain->lock); - list_for_each(bonds, &domain->bonds) - if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu) - break; - list_add_rcu(&bond->list, bonds); - spin_unlock(&domain->lock); - - /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. 
*/ - smp_mb(); - - return 0; -} - -static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, - struct device *dev) -{ - struct riscv_iommu_device *iommu = dev_to_iommu(dev); - struct riscv_iommu_bond *bond, *found = NULL; - struct riscv_iommu_command cmd; - int count = 0; - - if (!domain) - return; - - spin_lock(&domain->lock); - list_for_each_entry(bond, &domain->bonds, list) { - if (found && count) - break; - else if (bond->dev == dev) - found = bond; - else if (dev_to_iommu(bond->dev) == iommu) - count++; - } - if (found) - list_del_rcu(&found->list); - spin_unlock(&domain->lock); - kfree_rcu(found, rcu); - - /* - * If this was the last bond between this domain and the IOMMU - * invalidate all cached entries for domain's PSCID. - */ - if (!count) { - riscv_iommu_cmd_inval_vma(&cmd); - riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); - riscv_iommu_cmd_send(iommu, &cmd); - - riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); - } -} - -/* - * Send IOTLB.INVAL for whole address space for ranges larger than 2MB. - * This limit will be replaced with range invalidations, if supported by - * the hardware, when RISC-V IOMMU architecture specification update for - * range invalidations update will be available. - */ -#define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20) - -static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain, - unsigned long start, unsigned long end) -{ - struct riscv_iommu_bond *bond; - struct riscv_iommu_device *iommu, *prev; - struct riscv_iommu_command cmd; - unsigned long len = end - start + 1; - unsigned long iova; - - /* - * For each IOMMU linked with this protection domain (via bonds->dev), - * an IOTLB invaliation command will be submitted and executed. - * - * Possbile race with domain attach flow is handled by sequencing - * bond creation - riscv_iommu_bond_link(), and device directory - * update - riscv_iommu_iodir_update(). - * - * PTE Update / IOTLB Inval Device attach & directory update - * -------------------------- -------------------------- - * update page table entries add dev to the bond list - * FENCE RW,RW FENCE RW,RW - * For all IOMMUs: (can be empty) Update FSC/PSCID - * FENCE IOW,IOW FENCE IOW,IOW - * IOTLB.INVAL IODIR.INVAL - * IOFENCE.C - * - * If bond list is not updated with new device, directory context will - * be configured with already valid page table content. If an IOMMU is - * linked to the protection domain it will receive invalidation - * requests for updated page table entries. - */ - smp_mb(); - - rcu_read_lock(); - - prev = NULL; - list_for_each_entry_rcu(bond, &domain->bonds, list) { - iommu = dev_to_iommu(bond->dev); - - /* - * IOTLB invalidation request can be safely omitted if already sent - * to the IOMMU for the same PSCID, and with domain->bonds list - * arranged based on the device's IOMMU, it's sufficient to check - * last device the invalidation was sent to. 
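
riscv_iommu_bond_link()/bond_unlink(), removed above, keep the per-domain bond list grouped by owning IOMMU precisely so the invalidation loop that follows can skip consecutive entries sharing an IOMMU. The effect of that grouping can be seen in a few lines of plain C with a hypothetical bond array:

  #include <stdio.h>

  struct bond { const char *dev; int iommu_id; };

  /* Bond list kept grouped by owning IOMMU, as the driver arranges it. */
  static const struct bond bonds[] = {
      { "dev0", 0 }, { "dev1", 0 }, { "dev2", 1 }, { "dev3", 1 }, { "dev4", 2 },
  };

  int main(void)
  {
      int prev = -1;

      for (unsigned int i = 0; i < sizeof(bonds) / sizeof(bonds[0]); i++) {
          if (bonds[i].iommu_id == prev)
              continue;               /* already invalidated for this IOMMU */
          printf("IOTINVAL.VMA sent to IOMMU %d\n", bonds[i].iommu_id);
          prev = bonds[i].iommu_id;
      }
      return 0;
  }
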
- */ - if (iommu == prev) - continue; - - riscv_iommu_cmd_inval_vma(&cmd); - riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); - if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) { - for (iova = start; iova < end; iova += PAGE_SIZE) { - riscv_iommu_cmd_inval_set_addr(&cmd, iova); - riscv_iommu_cmd_send(iommu, &cmd); - } - } else { - riscv_iommu_cmd_send(iommu, &cmd); - } - prev = iommu; - } - - prev = NULL; - list_for_each_entry_rcu(bond, &domain->bonds, list) { - iommu = dev_to_iommu(bond->dev); - if (iommu == prev) - continue; - - riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); - prev = iommu; - } - rcu_read_unlock(); -} - -#define RISCV_IOMMU_FSC_BARE 0 - -/* - * Update IODIR for the device. - * - * During the execution of riscv_iommu_probe_device(), IODIR entries are - * allocated for the device's identifiers. Device context invalidation - * becomes necessary only if one of the updated entries was previously - * marked as valid, given that invalid device context entries are not - * cached by the IOMMU hardware. - * In this implementation, updating a valid device context while the - * device is not quiesced might be disruptive, potentially causing - * interim translation faults. - */ -static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, - struct device *dev, u64 fsc, u64 ta) -{ - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct riscv_iommu_dc *dc; - struct riscv_iommu_command cmd; - bool sync_required = false; - u64 tc; - int i; - - for (i = 0; i < fwspec->num_ids; i++) { - dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); - tc = READ_ONCE(dc->tc); - if (!(tc & RISCV_IOMMU_DC_TC_V)) - continue; - - WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V); - - /* Invalidate device context cached values */ - riscv_iommu_cmd_iodir_inval_ddt(&cmd); - riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); - riscv_iommu_cmd_send(iommu, &cmd); - sync_required = true; - } - - if (sync_required) - riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); - - /* - * For device context with DC_TC_PDTV = 0, translation attributes valid bit - * is stored as DC_TC_V bit (both sharing the same location at BIT(0)). - */ - for (i = 0; i < fwspec->num_ids; i++) { - dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); - tc = READ_ONCE(dc->tc); - tc |= ta & RISCV_IOMMU_DC_TC_V; - - WRITE_ONCE(dc->fsc, fsc); - WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID); - /* Update device context, write TC.V as the last step. */ - dma_wmb(); - WRITE_ONCE(dc->tc, tc); - - /* Invalidate device context after update */ - riscv_iommu_cmd_iodir_inval_ddt(&cmd); - riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); - riscv_iommu_cmd_send(iommu, &cmd); - } - - riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); -} - -/* - * IOVA page translation tree management. 
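
riscv_iommu_iodir_update(), removed above, follows the usual ordering for descriptors shared with hardware: withdraw the valid bit, invalidate cached copies, rewrite the payload, and only then republish behind a write barrier. A user-space sketch of that ordering, with C11 atomics standing in for WRITE_ONCE()/dma_wmb() and invented field names:

  #include <stdatomic.h>
  #include <stdint.h>
  #include <stdio.h>

  struct dev_ctx {
      uint64_t fsc;                   /* page-table root and mode */
      uint64_t ta;                    /* translation attributes   */
      _Atomic uint64_t tc;            /* bit 0: valid             */
  };

  #define CTX_VALID 0x1ull

  static void ctx_update(struct dev_ctx *dc, uint64_t fsc, uint64_t ta)
  {
      /* 1. Withdraw the entry so hardware stops using stale fields. */
      atomic_store_explicit(&dc->tc, 0, memory_order_release);
      /* ... hardware-side invalidation of cached copies goes here ... */

      /* 2. Rewrite the payload while the entry is invalid. */
      dc->fsc = fsc;
      dc->ta = ta;

      /* 3. Publish: payload must be visible before the valid bit. */
      atomic_store_explicit(&dc->tc, CTX_VALID, memory_order_release);
  }

  int main(void)
  {
      struct dev_ctx dc = { 0 };

      ctx_update(&dc, 0x8000000000001234ull, 0x10ull);
      printf("tc=%#llx fsc=%#llx\n",
             (unsigned long long)atomic_load(&dc.tc),
             (unsigned long long)dc.fsc);
      return 0;
  }
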
- */ - -static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain) -{ - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); - - riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); -} - -static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain, - struct iommu_iotlb_gather *gather) -{ - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); - - riscv_iommu_iotlb_inval(domain, gather->start, gather->end); -} - -#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t))) - -#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) -#define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF) -#define _io_pte_none(pte) ((pte) == 0) -#define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot)) - -static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain, - unsigned long pte, struct list_head *freelist) -{ - unsigned long *ptr; - int i; - - if (!_io_pte_present(pte) || _io_pte_leaf(pte)) - return; - - ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); - - /* Recursively free all sub page table pages */ - for (i = 0; i < PTRS_PER_PTE; i++) { - pte = READ_ONCE(ptr[i]); - if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte) - riscv_iommu_pte_free(domain, pte, freelist); - } - - if (freelist) - list_add_tail(&virt_to_page(ptr)->lru, freelist); - else - iommu_free_page(ptr); -} - -static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain, - unsigned long iova, size_t pgsize, - gfp_t gfp) -{ - unsigned long *ptr = domain->pgd_root; - unsigned long pte, old; - int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; - void *addr; - - do { - const int shift = PAGE_SHIFT + PT_SHIFT * level; - - ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); - /* - * Note: returned entry might be a non-leaf if there was - * existing mapping with smaller granularity. Up to the caller - * to replace and invalidate. - */ - if (((size_t)1 << shift) == pgsize) - return ptr; -pte_retry: - pte = READ_ONCE(*ptr); - /* - * This is very likely incorrect as we should not be adding - * new mapping with smaller granularity on top - * of existing 2M/1G mapping. Fail. - */ - if (_io_pte_present(pte) && _io_pte_leaf(pte)) - return NULL; - /* - * Non-leaf entry is missing, allocate and try to add to the - * page table. This might race with other mappings, retry. 
- */ - if (_io_pte_none(pte)) { - addr = iommu_alloc_page_node(domain->numa_node, gfp); - if (!addr) - return NULL; - old = pte; - pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE); - if (cmpxchg_relaxed(ptr, old, pte) != old) { - iommu_free_page(addr); - goto pte_retry; - } - } - ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); - } while (level-- > 0); - - return NULL; -} - -static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain, - unsigned long iova, size_t *pte_pgsize) -{ - unsigned long *ptr = domain->pgd_root; - unsigned long pte; - int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; - - do { - const int shift = PAGE_SHIFT + PT_SHIFT * level; - - ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); - pte = READ_ONCE(*ptr); - if (_io_pte_present(pte) && _io_pte_leaf(pte)) { - *pte_pgsize = (size_t)1 << shift; - return ptr; - } - if (_io_pte_none(pte)) - return NULL; - ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); - } while (level-- > 0); - - return NULL; -} - -static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain, - unsigned long iova, phys_addr_t phys, - size_t pgsize, size_t pgcount, int prot, - gfp_t gfp, size_t *mapped) -{ - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); - size_t size = 0; - unsigned long *ptr; - unsigned long pte, old, pte_prot; - int rc = 0; - LIST_HEAD(freelist); - - if (!(prot & IOMMU_WRITE)) - pte_prot = _PAGE_BASE | _PAGE_READ; - else if (domain->amo_enabled) - pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE; - else - pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY; - - while (pgcount) { - ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp); - if (!ptr) { - rc = -ENOMEM; - break; - } - - old = READ_ONCE(*ptr); - pte = _io_pte_entry(phys_to_pfn(phys), pte_prot); - if (cmpxchg_relaxed(ptr, old, pte) != old) - continue; - - riscv_iommu_pte_free(domain, old, &freelist); - - size += pgsize; - iova += pgsize; - phys += pgsize; - --pgcount; - } - - *mapped = size; - - if (!list_empty(&freelist)) { - /* - * In 1.0 spec version, the smallest scope we can use to - * invalidate all levels of page table (i.e. leaf and non-leaf) - * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0. - * This will be updated with hardware support for - * capability.NL (non-leaf) IOTINVAL command. - */ - riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); - iommu_put_pages_list(&freelist); - } - - return rc; -} - -static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain, - unsigned long iova, size_t pgsize, - size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); - size_t size = pgcount << __ffs(pgsize); - unsigned long *ptr, old; - size_t unmapped = 0; - size_t pte_size; - - while (unmapped < size) { - ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); - if (!ptr) - return unmapped; - - /* partial unmap is not allowed, fail. 
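
riscv_iommu_pte_alloc(), removed above, fills in missing non-leaf levels locklessly: allocate a page, try to install it with cmpxchg, and free it again if another thread won the race. A self-contained sketch of that install-or-retry step using C11 atomics, with calloc standing in for the IOMMU page allocator:

  #include <stdatomic.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>

  #define PTRS_PER_TABLE 512

  /* Install a child table under *slot unless someone else already did. */
  static uint64_t *get_or_install(_Atomic(uintptr_t) *slot)
  {
      uintptr_t cur = atomic_load(slot);

      while (!cur) {
          uint64_t *table = calloc(PTRS_PER_TABLE, sizeof(*table));
          uintptr_t expected = 0;

          if (!table)
              return NULL;
          if (atomic_compare_exchange_strong(slot, &expected, (uintptr_t)table))
              return table;           /* we installed it */
          free(table);                /* lost the race, use the winner's */
          cur = expected;
      }
      return (uint64_t *)cur;
  }

  int main(void)
  {
      _Atomic(uintptr_t) slot = 0;
      uint64_t *a = get_or_install(&slot);
      uint64_t *b = get_or_install(&slot);

      printf("same table: %s\n", (a == b) ? "yes" : "no");
      free(a);
      return 0;
  }
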
*/ - if (iova & (pte_size - 1)) - return unmapped; - - old = READ_ONCE(*ptr); - if (cmpxchg_relaxed(ptr, old, 0) != old) - continue; - - iommu_iotlb_gather_add_page(&domain->domain, gather, iova, - pte_size); - - iova += pte_size; - unmapped += pte_size; - } - - return unmapped; -} - -static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain, - dma_addr_t iova) -{ - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); - unsigned long pte_size; - unsigned long *ptr; - - ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); - if (_io_pte_none(*ptr) || !_io_pte_present(*ptr)) - return 0; - - return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1)); -} - -static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain) -{ - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); - const unsigned long pfn = virt_to_pfn(domain->pgd_root); - - WARN_ON(!list_empty(&domain->bonds)); - - if ((int)domain->pscid > 0) - ida_free(&riscv_iommu_pscids, domain->pscid); - - riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL); - kfree(domain); -} - -static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode) -{ - switch (pgd_mode) { - case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: - return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39; - - case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: - return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48; - - case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: - return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57; - } - return false; -} - -static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, - struct device *dev) -{ - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); - struct riscv_iommu_device *iommu = dev_to_iommu(dev); - struct riscv_iommu_info *info = dev_iommu_priv_get(dev); - u64 fsc, ta; - - if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) - return -ENODEV; - - fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | - FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); - ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | - RISCV_IOMMU_PC_TA_V; - - if (riscv_iommu_bond_link(domain, dev)) - return -ENOMEM; - - riscv_iommu_iodir_update(iommu, dev, fsc, ta); - riscv_iommu_bond_unlink(info->domain, dev); - info->domain = domain; - - return 0; -} - -static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = { - .attach_dev = riscv_iommu_attach_paging_domain, - .free = riscv_iommu_free_paging_domain, - .map_pages = riscv_iommu_map_pages, - .unmap_pages = riscv_iommu_unmap_pages, - .iova_to_phys = riscv_iommu_iova_to_phys, - .iotlb_sync = riscv_iommu_iotlb_sync, - .flush_iotlb_all = riscv_iommu_iotlb_flush_all, -}; - -static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) -{ - struct riscv_iommu_domain *domain; - struct riscv_iommu_device *iommu; - unsigned int pgd_mode; - dma_addr_t va_mask; - int va_bits; - - iommu = dev_to_iommu(dev); - if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) { - pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57; - va_bits = 57; - } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) { - pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48; - va_bits = 48; - } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) { - pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39; - va_bits = 39; - } else { - dev_err(dev, "cannot find supported page table mode\n"); - return ERR_PTR(-ENODEV); - } - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if 
(!domain) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD_RCU(&domain->bonds); - spin_lock_init(&domain->lock); - domain->numa_node = dev_to_node(iommu->dev); - domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD); - domain->pgd_mode = pgd_mode; - domain->pgd_root = iommu_alloc_page_node(domain->numa_node, - GFP_KERNEL_ACCOUNT); - if (!domain->pgd_root) { - kfree(domain); - return ERR_PTR(-ENOMEM); - } - - domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, - RISCV_IOMMU_MAX_PSCID, GFP_KERNEL); - if (domain->pscid < 0) { - iommu_free_page(domain->pgd_root); - kfree(domain); - return ERR_PTR(-ENOMEM); - } - - /* - * Note: RISC-V Privilege spec mandates that virtual addresses - * need to be sign-extended, so if (VA_BITS - 1) is set, all - * bits >= VA_BITS need to also be set or else we'll get a - * page fault. However the code that creates the mappings - * above us (e.g. iommu_dma_alloc_iova()) won't do that for us - * for now, so we'll end up with invalid virtual addresses - * to map. As a workaround until we get this sorted out - * limit the available virtual addresses to VA_BITS - 1. - */ - va_mask = DMA_BIT_MASK(va_bits - 1); - - domain->domain.geometry.aperture_start = 0; - domain->domain.geometry.aperture_end = va_mask; - domain->domain.geometry.force_aperture = true; - domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G); - - domain->domain.ops = &riscv_iommu_paging_domain_ops; - - return &domain->domain; -} - -static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, - struct device *dev) -{ - struct riscv_iommu_device *iommu = dev_to_iommu(dev); - struct riscv_iommu_info *info = dev_iommu_priv_get(dev); - - /* Make device context invalid, translation requests will fault w/ #258 */ - riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0); - riscv_iommu_bond_unlink(info->domain, dev); - info->domain = NULL; - - return 0; -} - -static struct iommu_domain riscv_iommu_blocking_domain = { - .type = IOMMU_DOMAIN_BLOCKED, - .ops = &(const struct iommu_domain_ops) { - .attach_dev = riscv_iommu_attach_blocking_domain, - } -}; - static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, struct device *dev) { - struct riscv_iommu_device *iommu = dev_to_iommu(dev); - struct riscv_iommu_info *info = dev_iommu_priv_get(dev); - - riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V); - riscv_iommu_bond_unlink(info->domain, dev); - info->domain = NULL; - + /* Global pass-through already enabled, do nothing for now. */ return 0; } @@ -1455,6 +51,11 @@ static struct iommu_domain riscv_iommu_identity_domain = { } }; +static int riscv_iommu_device_domain_type(struct device *dev) +{ + return IOMMU_DOMAIN_IDENTITY; +} + static struct iommu_group *riscv_iommu_device_group(struct device *dev) { if (dev_is_pci(dev)) @@ -1471,10 +72,6 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct riscv_iommu_device *iommu; - struct riscv_iommu_info *info; - struct riscv_iommu_dc *dc; - u64 tc; - int i; if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids) return ERR_PTR(-ENODEV); @@ -1483,56 +80,15 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev) if (!iommu) return ERR_PTR(-ENODEV); - /* - * IOMMU hardware operating in fail-over BARE mode will provide - * identity translation for all connected devices anyway... 
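
The aperture workaround in the removed riscv_iommu_alloc_paging_domain() restricts IOVAs to VA_BITS - 1 bits because Sv39/48/57 require canonical (sign-extended) addresses and the common IOVA allocator only hands out the low half. A quick illustration of the resulting apertures, assuming the kernel's usual DMA_BIT_MASK() definition:

  #include <stdint.h>
  #include <stdio.h>

  #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ull : ((1ull << (n)) - 1))

  int main(void)
  {
      int va_bits[] = { 39, 48, 57 };

      for (unsigned int i = 0; i < 3; i++) {
          uint64_t mask = DMA_BIT_MASK(va_bits[i] - 1);

          printf("Sv%d: usable IOVA range 0x0 - %#llx\n",
                 va_bits[i], (unsigned long long)mask);
      }
      return 0;
  }
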
- */ - if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) - return ERR_PTR(-ENODEV); - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return ERR_PTR(-ENOMEM); - /* - * Allocate and pre-configure device context entries in - * the device directory. Do not mark the context valid yet. - */ - tc = 0; - if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD) - tc |= RISCV_IOMMU_DC_TC_SADE; - for (i = 0; i < fwspec->num_ids; i++) { - dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); - if (!dc) { - kfree(info); - return ERR_PTR(-ENODEV); - } - if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V) - dev_warn(dev, "already attached to IOMMU device directory\n"); - WRITE_ONCE(dc->tc, tc); - } - - dev_iommu_priv_set(dev, info); - return &iommu->iommu; } -static void riscv_iommu_release_device(struct device *dev) -{ - struct riscv_iommu_info *info = dev_iommu_priv_get(dev); - - kfree_rcu_mightsleep(info); -} - static const struct iommu_ops riscv_iommu_ops = { - .pgsize_bitmap = SZ_4K, .of_xlate = riscv_iommu_of_xlate, .identity_domain = &riscv_iommu_identity_domain, - .blocked_domain = &riscv_iommu_blocking_domain, - .release_domain = &riscv_iommu_blocking_domain, - .domain_alloc_paging = riscv_iommu_alloc_paging_domain, + .def_domain_type = riscv_iommu_device_domain_type, .device_group = riscv_iommu_device_group, .probe_device = riscv_iommu_probe_device, - .release_device = riscv_iommu_release_device, }; static int riscv_iommu_init_check(struct riscv_iommu_device *iommu) @@ -1568,24 +124,6 @@ static int riscv_iommu_init_check(struct riscv_iommu_device *iommu) return -EINVAL; } - /* - * Distribute interrupt vectors, always use first vector for CIV. - * At least one interrupt is required. Read back and verify. - */ - if (!iommu->irqs_count) - return -EINVAL; - - iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) | - FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) | - FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count); - riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec); - iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC); - if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec), - FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)), - max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec), - FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count) - return -EINVAL; - return 0; } @@ -1593,54 +131,29 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu) { iommu_device_unregister(&iommu->iommu); iommu_device_sysfs_remove(&iommu->iommu); - riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); - riscv_iommu_queue_disable(&iommu->cmdq); - riscv_iommu_queue_disable(&iommu->fltq); } int riscv_iommu_init(struct riscv_iommu_device *iommu) { int rc; - RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ); - RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ); - rc = riscv_iommu_init_check(iommu); if (rc) return dev_err_probe(iommu->dev, rc, "unexpected device state\n"); - rc = riscv_iommu_iodir_alloc(iommu); - if (rc) - return rc; - - rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq, - sizeof(struct riscv_iommu_command)); - if (rc) - return rc; - - rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq, - sizeof(struct riscv_iommu_fq_record)); - if (rc) - return rc; - - rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process); - if (rc) - return rc; - - rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process); - if (rc) - goto err_queue_disable; - - rc = riscv_iommu_iodir_set_mode(iommu, 
RISCV_IOMMU_DDTP_IOMMU_MODE_MAX); - if (rc) - goto err_queue_disable; + /* + * Placeholder for a complete IOMMU device initialization. For now, + * only bare minimum: enable global identity mapping mode and register sysfs. + */ + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, + FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, + RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)); rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s", dev_name(iommu->dev)); - if (rc) { - dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n"); - goto err_iodir_off; - } + if (rc) + return dev_err_probe(iommu->dev, rc, + "cannot register sysfs interface\n"); rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev); if (rc) { @@ -1652,10 +165,5 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu) err_remove_sysfs: iommu_device_sysfs_remove(&iommu->iommu); -err_iodir_off: - riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); -err_queue_disable: - riscv_iommu_queue_disable(&iommu->fltq); - riscv_iommu_queue_disable(&iommu->cmdq); return rc; } diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h index b1c4664542b48b3442c80205a9608e82c896940d..700e33dc24466023c953abc7f99b938265f1cca7 100644 --- a/drivers/iommu/riscv/iommu.h +++ b/drivers/iommu/riscv/iommu.h @@ -17,22 +17,6 @@ #include "iommu-bits.h" -struct riscv_iommu_device; - -struct riscv_iommu_queue { - atomic_t prod; /* unbounded producer allocation index */ - atomic_t head; /* unbounded shadow ring buffer consumer index */ - atomic_t tail; /* unbounded shadow ring buffer producer index */ - unsigned int mask; /* index mask, queue length - 1 */ - unsigned int irq; /* allocated interrupt number */ - struct riscv_iommu_device *iommu; /* iommu device handling the queue when active */ - void *base; /* ring buffer kernel pointer */ - dma_addr_t phys; /* ring buffer physical address */ - u16 qbr; /* base register offset, head and tail reference */ - u16 qcr; /* control and status register offset */ - u8 qid; /* queue identifier, same as RISCV_IOMMU_INTR_XX */ -}; - struct riscv_iommu_device { /* iommu core interface */ struct iommu_device iommu; @@ -50,16 +34,6 @@ struct riscv_iommu_device { /* available interrupt numbers, MSI or WSI */ unsigned int irqs[RISCV_IOMMU_INTR_COUNT]; unsigned int irqs_count; - unsigned int icvec; - - /* hardware queues */ - struct riscv_iommu_queue cmdq; - struct riscv_iommu_queue fltq; - - /* device directory */ - unsigned int ddt_mode; - dma_addr_t ddt_phys; - u64 *ddt_root; }; int riscv_iommu_init(struct riscv_iommu_device *iommu); diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index c49c5e4d4f808f9206f79faa1a42369268b51d17..e77a20f1dff8570a37b44b32f475b40b9d7b21b7 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -113,7 +113,6 @@ struct rk_iommu { struct iommu_device iommu; struct list_head node; /* entry in rk_iommu_domain.iommus */ struct iommu_domain *domain; /* domain to which iommu is attached */ - struct iommu_group *group; }; struct rk_iommudata { @@ -817,7 +816,8 @@ static int rk_iommu_map_iova(struct rk_iommu_domain *rk_domain, u32 *pte_addr, } static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t size, size_t count, + int prot, gfp_t gfp, size_t *mapped) { struct rk_iommu_domain *rk_domain = to_rk_domain(domain); unsigned long flags; @@ -850,12 +850,14 @@ static int rk_iommu_map(struct iommu_domain 
*domain, unsigned long _iova, paddr, size, prot); spin_unlock_irqrestore(&rk_domain->dt_lock, flags); + if (!ret) + *mapped = size; return ret; } static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova, - size_t size, struct iommu_iotlb_gather *gather) + size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct rk_iommu_domain *rk_domain = to_rk_domain(domain); unsigned long flags; @@ -1137,15 +1139,6 @@ static void rk_iommu_release_device(struct device *dev) device_link_del(data->link); } -static struct iommu_group *rk_iommu_device_group(struct device *dev) -{ - struct rk_iommu *iommu; - - iommu = rk_iommu_from_dev(dev); - - return iommu_group_ref_get(iommu->group); -} - static int rk_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args) { @@ -1171,13 +1164,13 @@ static const struct iommu_ops rk_iommu_ops = { .domain_alloc_paging = rk_iommu_domain_alloc_paging, .probe_device = rk_iommu_probe_device, .release_device = rk_iommu_release_device, - .device_group = rk_iommu_device_group, + .device_group = generic_single_device_group, .pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP, .of_xlate = rk_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = rk_iommu_attach_device, - .map = rk_iommu_map, - .unmap = rk_iommu_unmap, + .map_pages = rk_iommu_map, + .unmap_pages = rk_iommu_unmap, .iova_to_phys = rk_iommu_iova_to_phys, .free = rk_iommu_domain_free, } @@ -1261,15 +1254,9 @@ static int rk_iommu_probe(struct platform_device *pdev) if (err) return err; - iommu->group = iommu_group_alloc(); - if (IS_ERR(iommu->group)) { - err = PTR_ERR(iommu->group); - goto err_unprepare_clocks; - } - err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev)); if (err) - goto err_put_group; + goto err_unprepare_clocks; err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev); if (err) @@ -1306,8 +1293,6 @@ static int rk_iommu_probe(struct platform_device *pdev) pm_runtime_disable(dev); err_remove_sysfs: iommu_device_sysfs_remove(&iommu->iommu); -err_put_group: - iommu_group_put(iommu->group); err_unprepare_clocks: clk_bulk_unprepare(iommu->num_clocks, iommu->clocks); return err; diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 5695ad71d60e244b5a35466436044608992b77ef..9a5196f523de5f0f8902f054b1b3722d14b18cd2 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -14,16 +14,300 @@ #include #include +#include "dma-iommu.h" + static const struct iommu_ops s390_iommu_ops; +static struct kmem_cache *dma_region_table_cache; +static struct kmem_cache *dma_page_table_cache; + +static u64 s390_iommu_aperture; +static u32 s390_iommu_aperture_factor = 1; + struct s390_domain { struct iommu_domain domain; struct list_head devices; + struct zpci_iommu_ctrs ctrs; unsigned long *dma_table; spinlock_t list_lock; struct rcu_head rcu; }; +static inline unsigned int calc_rtx(dma_addr_t ptr) +{ + return ((unsigned long)ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK; +} + +static inline unsigned int calc_sx(dma_addr_t ptr) +{ + return ((unsigned long)ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK; +} + +static inline unsigned int calc_px(dma_addr_t ptr) +{ + return ((unsigned long)ptr >> PAGE_SHIFT) & ZPCI_PT_MASK; +} + +static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa) +{ + *entry &= ZPCI_PTE_FLAG_MASK; + *entry |= (pfaa & ZPCI_PTE_ADDR_MASK); +} + +static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto) +{ + *entry &= ZPCI_RTE_FLAG_MASK; + *entry |= (sto & 
ZPCI_RTE_ADDR_MASK); + *entry |= ZPCI_TABLE_TYPE_RTX; +} + +static inline void set_st_pto(unsigned long *entry, phys_addr_t pto) +{ + *entry &= ZPCI_STE_FLAG_MASK; + *entry |= (pto & ZPCI_STE_ADDR_MASK); + *entry |= ZPCI_TABLE_TYPE_SX; +} + +static inline void validate_rt_entry(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_VALID_MASK; + *entry &= ~ZPCI_TABLE_OFFSET_MASK; + *entry |= ZPCI_TABLE_VALID; + *entry |= ZPCI_TABLE_LEN_RTX; +} + +static inline void validate_st_entry(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_VALID_MASK; + *entry |= ZPCI_TABLE_VALID; +} + +static inline void invalidate_pt_entry(unsigned long *entry) +{ + WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID); + *entry &= ~ZPCI_PTE_VALID_MASK; + *entry |= ZPCI_PTE_INVALID; +} + +static inline void validate_pt_entry(unsigned long *entry) +{ + WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID); + *entry &= ~ZPCI_PTE_VALID_MASK; + *entry |= ZPCI_PTE_VALID; +} + +static inline void entry_set_protected(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_PROT_MASK; + *entry |= ZPCI_TABLE_PROTECTED; +} + +static inline void entry_clr_protected(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_PROT_MASK; + *entry |= ZPCI_TABLE_UNPROTECTED; +} + +static inline int reg_entry_isvalid(unsigned long entry) +{ + return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID; +} + +static inline int pt_entry_isvalid(unsigned long entry) +{ + return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; +} + +static inline unsigned long *get_rt_sto(unsigned long entry) +{ + if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX) + return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK); + else + return NULL; +} + +static inline unsigned long *get_st_pto(unsigned long entry) +{ + if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX) + return phys_to_virt(entry & ZPCI_STE_ADDR_MASK); + else + return NULL; +} + +static int __init dma_alloc_cpu_table_caches(void) +{ + dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables", + ZPCI_TABLE_SIZE, + ZPCI_TABLE_ALIGN, + 0, NULL); + if (!dma_region_table_cache) + return -ENOMEM; + + dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables", + ZPCI_PT_SIZE, + ZPCI_PT_ALIGN, + 0, NULL); + if (!dma_page_table_cache) { + kmem_cache_destroy(dma_region_table_cache); + return -ENOMEM; + } + return 0; +} + +static unsigned long *dma_alloc_cpu_table(gfp_t gfp) +{ + unsigned long *table, *entry; + + table = kmem_cache_alloc(dma_region_table_cache, gfp); + if (!table) + return NULL; + + for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++) + *entry = ZPCI_TABLE_INVALID; + return table; +} + +static void dma_free_cpu_table(void *table) +{ + kmem_cache_free(dma_region_table_cache, table); +} + +static void dma_free_page_table(void *table) +{ + kmem_cache_free(dma_page_table_cache, table); +} + +static void dma_free_seg_table(unsigned long entry) +{ + unsigned long *sto = get_rt_sto(entry); + int sx; + + for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++) + if (reg_entry_isvalid(sto[sx])) + dma_free_page_table(get_st_pto(sto[sx])); + + dma_free_cpu_table(sto); +} + +static void dma_cleanup_tables(unsigned long *table) +{ + int rtx; + + if (!table) + return; + + for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++) + if (reg_entry_isvalid(table[rtx])) + dma_free_seg_table(table[rtx]); + + dma_free_cpu_table(table); +} + +static unsigned long *dma_alloc_page_table(gfp_t gfp) +{ + unsigned long *table, *entry; + + table = kmem_cache_alloc(dma_page_table_cache, gfp); + if (!table) + 
return NULL; + + for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++) + *entry = ZPCI_PTE_INVALID; + return table; +} + +static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp) +{ + unsigned long old_rte, rte; + unsigned long *sto; + + rte = READ_ONCE(*rtep); + if (reg_entry_isvalid(rte)) { + sto = get_rt_sto(rte); + } else { + sto = dma_alloc_cpu_table(gfp); + if (!sto) + return NULL; + + set_rt_sto(&rte, virt_to_phys(sto)); + validate_rt_entry(&rte); + entry_clr_protected(&rte); + + old_rte = cmpxchg(rtep, ZPCI_TABLE_INVALID, rte); + if (old_rte != ZPCI_TABLE_INVALID) { + /* Somone else was faster, use theirs */ + dma_free_cpu_table(sto); + sto = get_rt_sto(old_rte); + } + } + return sto; +} + +static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp) +{ + unsigned long old_ste, ste; + unsigned long *pto; + + ste = READ_ONCE(*step); + if (reg_entry_isvalid(ste)) { + pto = get_st_pto(ste); + } else { + pto = dma_alloc_page_table(gfp); + if (!pto) + return NULL; + set_st_pto(&ste, virt_to_phys(pto)); + validate_st_entry(&ste); + entry_clr_protected(&ste); + + old_ste = cmpxchg(step, ZPCI_TABLE_INVALID, ste); + if (old_ste != ZPCI_TABLE_INVALID) { + /* Somone else was faster, use theirs */ + dma_free_page_table(pto); + pto = get_st_pto(old_ste); + } + } + return pto; +} + +static unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, gfp_t gfp) +{ + unsigned long *sto, *pto; + unsigned int rtx, sx, px; + + rtx = calc_rtx(dma_addr); + sto = dma_get_seg_table_origin(&rto[rtx], gfp); + if (!sto) + return NULL; + + sx = calc_sx(dma_addr); + pto = dma_get_page_table_origin(&sto[sx], gfp); + if (!pto) + return NULL; + + px = calc_px(dma_addr); + return &pto[px]; +} + +static void dma_update_cpu_trans(unsigned long *ptep, phys_addr_t page_addr, int flags) +{ + unsigned long pte; + + pte = READ_ONCE(*ptep); + if (flags & ZPCI_PTE_INVALID) { + invalidate_pt_entry(&pte); + } else { + set_pt_pfaa(&pte, page_addr); + validate_pt_entry(&pte); + } + + if (flags & ZPCI_TABLE_PROTECTED) + entry_set_protected(&pte); + else + entry_clr_protected(&pte); + + xchg(ptep, pte); +} + static struct s390_domain *to_s390_domain(struct iommu_domain *dom) { return container_of(dom, struct s390_domain, domain); @@ -31,9 +315,13 @@ static struct s390_domain *to_s390_domain(struct iommu_domain *dom) static bool s390_iommu_capable(struct device *dev, enum iommu_cap cap) { + struct zpci_dev *zdev = to_zpci_dev(dev); + switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: return true; + case IOMMU_CAP_DEFERRED_FLUSH: + return zdev->pft != PCI_FUNC_TYPE_ISM; default: return false; } @@ -81,14 +369,13 @@ static void s390_domain_free(struct iommu_domain *domain) call_rcu(&s390_domain->rcu, s390_iommu_rcu_free_domain); } -static void __s390_iommu_detach_device(struct zpci_dev *zdev) +static void s390_iommu_detach_device(struct iommu_domain *domain, + struct device *dev) { - struct s390_domain *s390_domain = zdev->s390_domain; + struct s390_domain *s390_domain = to_s390_domain(domain); + struct zpci_dev *zdev = to_zpci_dev(dev); unsigned long flags; - if (!s390_domain) - return; - spin_lock_irqsave(&s390_domain->list_lock, flags); list_del_rcu(&zdev->iommu_list); spin_unlock_irqrestore(&s390_domain->list_lock, flags); @@ -115,9 +402,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, return -EINVAL; if (zdev->s390_domain) - __s390_iommu_detach_device(zdev); - else if (zdev->dma_table) - zpci_dma_exit_device(zdev); + 
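
dma_walk_cpu_trans(), added to s390-iommu.c above, resolves a DMA address in three shift-and-mask steps: region-table index (rtx), segment-table index (sx) and page-table index (px). The decomposition looks like the snippet below; the shift and width constants here are example values for illustration, not the real ZPCI_* definitions:

  #include <stdint.h>
  #include <stdio.h>

  /* Example layout only: 12-bit page offset, 8-bit px, 11-bit sx, 11-bit rtx. */
  #define PAGE_SHIFT 12
  #define PT_BITS    8
  #define TBL_BITS   11
  #define ST_SHIFT   (PAGE_SHIFT + PT_BITS)
  #define RT_SHIFT   (ST_SHIFT + TBL_BITS)

  int main(void)
  {
      uint64_t dma_addr = 0x123456789000ull;

      unsigned int px  = (dma_addr >> PAGE_SHIFT) & ((1u << PT_BITS) - 1);
      unsigned int sx  = (dma_addr >> ST_SHIFT)   & ((1u << TBL_BITS) - 1);
      unsigned int rtx = (dma_addr >> RT_SHIFT)   & ((1u << TBL_BITS) - 1);

      printf("rtx=%u sx=%u px=%u offset=%#llx\n", rtx, sx, px,
             (unsigned long long)(dma_addr & ((1u << PAGE_SHIFT) - 1)));
      return 0;
  }
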
s390_iommu_detach_device(&zdev->s390_domain->domain, dev); cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, virt_to_phys(s390_domain->dma_table), &status); @@ -127,7 +412,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, */ if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) return -EIO; - zdev->dma_table = s390_domain->dma_table; zdev->dma_table = s390_domain->dma_table; zdev->s390_domain = s390_domain; @@ -139,31 +423,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, return 0; } -/* - * Switch control over the IOMMU to S390's internal dma_api ops - */ -static int s390_iommu_platform_attach(struct iommu_domain *platform_domain, - struct device *dev) -{ - struct zpci_dev *zdev = to_zpci_dev(dev); - - if (!zdev->s390_domain) - return 0; - - __s390_iommu_detach_device(zdev); - zpci_dma_init_device(zdev); - return 0; -} - -static struct iommu_domain_ops s390_iommu_platform_ops = { - .attach_dev = s390_iommu_platform_attach, -}; - -static struct iommu_domain s390_iommu_platform_domain = { - .type = IOMMU_DOMAIN_PLATFORM, - .ops = &s390_iommu_platform_ops, -}; - static void s390_iommu_get_resv_regions(struct device *dev, struct list_head *list) { @@ -204,6 +463,9 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev) if (zdev->end_dma > ZPCI_TABLE_SIZE_RT - 1) zdev->end_dma = ZPCI_TABLE_SIZE_RT - 1; + if (zdev->tlb_refresh) + dev->iommu->shadow_on_flush = 1; + return &zdev->iommu_dev; } @@ -216,7 +478,13 @@ static void s390_iommu_release_device(struct device *dev) * to the device, but keep it attached to other devices in the group. */ if (zdev) - __s390_iommu_detach_device(zdev); + s390_iommu_detach_device(&zdev->s390_domain->domain, dev); +} + +static int zpci_refresh_all(struct zpci_dev *zdev) +{ + return zpci_refresh_trans((u64)zdev->fh << 32, zdev->start_dma, + zdev->end_dma - zdev->start_dma + 1); } static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain) @@ -226,8 +494,8 @@ static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain) rcu_read_lock(); list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { - zpci_refresh_trans((u64)zdev->fh << 32, zdev->start_dma, - zdev->end_dma - zdev->start_dma + 1); + atomic64_inc(&s390_domain->ctrs.global_rpcits); + zpci_refresh_all(zdev); } rcu_read_unlock(); } @@ -245,26 +513,40 @@ static void s390_iommu_iotlb_sync(struct iommu_domain *domain, rcu_read_lock(); list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { + atomic64_inc(&s390_domain->ctrs.sync_rpcits); zpci_refresh_trans((u64)zdev->fh << 32, gather->start, size); } rcu_read_unlock(); } -static void s390_iommu_iotlb_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int s390_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev; + int ret = 0; rcu_read_lock(); list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { if (!zdev->tlb_refresh) continue; - zpci_refresh_trans((u64)zdev->fh << 32, - iova, size); + atomic64_inc(&s390_domain->ctrs.sync_map_rpcits); + ret = zpci_refresh_trans((u64)zdev->fh << 32, + iova, size); + /* + * let the hypervisor discover invalidated entries + * allowing it to free IOVAs and unpin pages + */ + if (ret == -ENOMEM) { + ret = zpci_refresh_all(zdev); + if (ret) + break; + } } rcu_read_unlock(); + + return ret; } static int s390_iommu_validate_trans(struct s390_domain *s390_domain, @@ -344,16 +626,15 @@ 
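
The new s390_iommu_iotlb_sync_map() above treats -ENOMEM from a ranged RPCIT refresh as a sign that the hypervisor ran out of pinning resources and retries with a full-aperture refresh so invalidated entries can be reclaimed. The shape of that fallback, with stub flush functions standing in for zpci_refresh_trans()/zpci_refresh_all():

  #include <errno.h>
  #include <stdio.h>

  static int ranged_flush(unsigned long iova, unsigned long size)
  {
      (void)iova; (void)size;
      return -ENOMEM;                 /* simulate hypervisor resource pressure */
  }

  static int full_flush(void)
  {
      return 0;                       /* full refresh lets stale entries be reclaimed */
  }

  static int sync_map(unsigned long iova, unsigned long size)
  {
      int ret = ranged_flush(iova, size);

      if (ret == -ENOMEM)             /* fall back to flushing the whole aperture */
          ret = full_flush();
      return ret;
  }

  int main(void)
  {
      printf("sync_map -> %d\n", sync_map(0x100000, 0x1000));
      return 0;
  }
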
static int s390_iommu_map_pages(struct iommu_domain *domain, if (!IS_ALIGNED(iova | paddr, pgsize)) return -EINVAL; - if (!(prot & IOMMU_READ)) - return -EINVAL; - if (!(prot & IOMMU_WRITE)) flags |= ZPCI_TABLE_PROTECTED; rc = s390_iommu_validate_trans(s390_domain, paddr, iova, - pgcount, flags, gfp); - if (!rc) + pgcount, flags, gfp); + if (!rc) { *mapped = size; + atomic64_add(pgcount, &s390_domain->ctrs.mapped_pages); + } return rc; } @@ -409,12 +690,26 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain, return 0; iommu_iotlb_gather_add_range(gather, iova, size); + atomic64_add(pgcount, &s390_domain->ctrs.unmapped_pages); return size; } +static void s390_iommu_probe_finalize(struct device *dev) +{ + iommu_setup_dma_ops(dev, 0, U64_MAX); +} + +struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev) +{ + if (!zdev || !zdev->s390_domain) + return NULL; + return &zdev->s390_domain->ctrs; +} + int zpci_init_iommu(struct zpci_dev *zdev) { + u64 aperture_size; int rc = 0; rc = iommu_device_sysfs_add(&zdev->iommu_dev, NULL, NULL, @@ -426,6 +721,12 @@ int zpci_init_iommu(struct zpci_dev *zdev) if (rc) goto out_sysfs; + zdev->start_dma = PAGE_ALIGN(zdev->start_dma); + aperture_size = min3(s390_iommu_aperture, + ZPCI_TABLE_SIZE_RT - zdev->start_dma, + zdev->end_dma - zdev->start_dma + 1); + zdev->end_dma = zdev->start_dma + aperture_size - 1; + return 0; out_sysfs: @@ -441,11 +742,50 @@ void zpci_destroy_iommu(struct zpci_dev *zdev) iommu_device_sysfs_remove(&zdev->iommu_dev); } +static int __init s390_iommu_setup(char *str) +{ + if (!strcmp(str, "strict")) { + pr_warn("s390_iommu=strict deprecated; use iommu.strict=1 instead\n"); + iommu_set_dma_strict(); + } + return 1; +} + +__setup("s390_iommu=", s390_iommu_setup); + +static int __init s390_iommu_aperture_setup(char *str) +{ + if (kstrtou32(str, 10, &s390_iommu_aperture_factor)) + s390_iommu_aperture_factor = 1; + return 1; +} + +__setup("s390_iommu_aperture=", s390_iommu_aperture_setup); + +static int __init s390_iommu_init(void) +{ + int rc; + + iommu_dma_forcedac = true; + s390_iommu_aperture = (u64)virt_to_phys(high_memory); + if (!s390_iommu_aperture_factor) + s390_iommu_aperture = ULONG_MAX; + else + s390_iommu_aperture *= s390_iommu_aperture_factor; + + rc = dma_alloc_cpu_table_caches(); + if (rc) + return rc; + + return rc; +} +subsys_initcall(s390_iommu_init); + static const struct iommu_ops s390_iommu_ops = { - .default_domain = &s390_iommu_platform_domain, .capable = s390_iommu_capable, .domain_alloc_paging = s390_domain_alloc_paging, .probe_device = s390_iommu_probe_device, + .probe_finalize = s390_iommu_probe_finalize, .release_device = s390_iommu_release_device, .device_group = generic_device_group, .pgsize_bitmap = SZ_4K, diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index 4b9edbb33a9513c940f9dcba4aa580bd168414c0..a2f4ffe6d9491c1e495439cd031592dacbdaef1b 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -70,7 +70,6 @@ struct sprd_iommu_device { void __iomem *base; struct device *dev; struct iommu_device iommu; - struct iommu_group *group; struct clk *eb; }; @@ -342,8 +341,8 @@ static size_t sprd_iommu_unmap(struct iommu_domain *domain, unsigned long iova, return size; } -static void sprd_iommu_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int sprd_iommu_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct sprd_iommu_domain *dom = to_sprd_domain(domain); unsigned int reg; @@ -355,6 +354,7 @@ 
static void sprd_iommu_sync_map(struct iommu_domain *domain, /* clear IOMMU TLB buffer after page table updated */ sprd_iommu_write(dom->sdev, reg, 0xffffffff); + return 0; } static void sprd_iommu_sync(struct iommu_domain *domain, @@ -384,23 +384,10 @@ static phys_addr_t sprd_iommu_iova_to_phys(struct iommu_domain *domain, } static struct iommu_device *sprd_iommu_probe_device(struct device *dev) -{ - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct sprd_iommu_device *sdev; - - if (!fwspec || fwspec->ops != &sprd_iommu_ops) - return ERR_PTR(-ENODEV); - - sdev = dev_iommu_priv_get(dev); - - return &sdev->iommu; -} - -static struct iommu_group *sprd_iommu_device_group(struct device *dev) { struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev); - return iommu_group_ref_get(sdev->group); + return &sdev->iommu; } static int sprd_iommu_of_xlate(struct device *dev, @@ -421,7 +408,7 @@ static int sprd_iommu_of_xlate(struct device *dev, static const struct iommu_ops sprd_iommu_ops = { .domain_alloc_paging = sprd_iommu_domain_alloc_paging, .probe_device = sprd_iommu_probe_device, - .device_group = sprd_iommu_device_group, + .device_group = generic_single_device_group, .of_xlate = sprd_iommu_of_xlate, .pgsize_bitmap = SPRD_IOMMU_PAGE_SIZE, .owner = THIS_MODULE, @@ -494,16 +481,9 @@ static int sprd_iommu_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sdev); sdev->dev = dev; - /* All the client devices are in the same iommu-group */ - sdev->group = iommu_group_alloc(); - if (IS_ERR(sdev->group)) { - ret = PTR_ERR(sdev->group); - goto free_page; - } - ret = iommu_device_sysfs_add(&sdev->iommu, dev, NULL, dev_name(dev)); if (ret) - goto put_group; + goto free_page; ret = iommu_device_register(&sdev->iommu, &sprd_iommu_ops, dev); if (ret) @@ -528,8 +508,6 @@ static int sprd_iommu_probe(struct platform_device *pdev) iommu_device_unregister(&sdev->iommu); remove_sysfs: iommu_device_sysfs_remove(&sdev->iommu); -put_group: - iommu_group_put(sdev->group); free_page: dma_free_coherent(sdev->dev, SPRD_IOMMU_PAGE_SIZE, sdev->prot_page_va, sdev->prot_page_pa); return ret; @@ -541,9 +519,6 @@ static void sprd_iommu_remove(struct platform_device *pdev) dma_free_coherent(sdev->dev, SPRD_IOMMU_PAGE_SIZE, sdev->prot_page_va, sdev->prot_page_pa); - iommu_group_put(sdev->group); - sdev->group = NULL; - platform_set_drvdata(pdev, NULL); iommu_device_sysfs_remove(&sdev->iommu); iommu_device_unregister(&sdev->iommu); diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index f8148b97e309406cc73cb72cffe7a956c834f0f1..9896b940df66666eaba10c22c88d1eb2d0b0c7ce 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -107,7 +107,6 @@ struct sun50i_iommu { struct clk *clk; struct iommu_domain *domain; - struct iommu_group *group; struct kmem_cache *pt_pool; }; @@ -402,8 +401,8 @@ static void sun50i_iommu_flush_iotlb_all(struct iommu_domain *domain) spin_unlock_irqrestore(&iommu->iommu_lock, flags); } -static void sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain, - unsigned long iova, size_t size) +static int sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu = sun50i_domain->iommu; @@ -412,6 +411,8 @@ static void sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain, spin_lock_irqsave(&iommu->iommu_lock, flags); sun50i_iommu_zap_range(iommu, iova, size); spin_unlock_irqrestore(&iommu->iommu_lock, flags); + 
+ return 0; } static void sun50i_iommu_iotlb_sync(struct iommu_domain *domain, @@ -590,7 +591,8 @@ static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain, } static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t size, size_t count, + int prot, gfp_t gfp, size_t *mapped) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu = sun50i_domain->iommu; @@ -617,13 +619,14 @@ static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova, *pte_addr = sun50i_mk_pte(paddr, prot); sun50i_table_flush(sun50i_domain, pte_addr, 1); + *mapped = size; out: return ret; } static size_t sun50i_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) + size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); phys_addr_t pt_phys; @@ -816,13 +819,6 @@ static struct iommu_device *sun50i_iommu_probe_device(struct device *dev) return &iommu->iommu; } -static struct iommu_group *sun50i_iommu_device_group(struct device *dev) -{ - struct sun50i_iommu *iommu = sun50i_iommu_from_dev(dev); - - return iommu_group_ref_get(iommu->group); -} - static int sun50i_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args) { @@ -837,7 +833,7 @@ static int sun50i_iommu_of_xlate(struct device *dev, static const struct iommu_ops sun50i_iommu_ops = { .identity_domain = &sun50i_iommu_identity_domain, .pgsize_bitmap = SZ_4K, - .device_group = sun50i_iommu_device_group, + .device_group = generic_single_device_group, .domain_alloc_paging = sun50i_iommu_domain_alloc_paging, .of_xlate = sun50i_iommu_of_xlate, .probe_device = sun50i_iommu_probe_device, @@ -847,8 +843,8 @@ static const struct iommu_ops sun50i_iommu_ops = { .iotlb_sync_map = sun50i_iommu_iotlb_sync_map, .iotlb_sync = sun50i_iommu_iotlb_sync, .iova_to_phys = sun50i_iommu_iova_to_phys, - .map = sun50i_iommu_map, - .unmap = sun50i_iommu_unmap, + .map_pages = sun50i_iommu_map, + .unmap_pages = sun50i_iommu_unmap, .free = sun50i_iommu_domain_free, } }; @@ -1005,42 +1001,36 @@ static int sun50i_iommu_probe(struct platform_device *pdev) if (!iommu->pt_pool) return -ENOMEM; - iommu->group = iommu_group_alloc(); - if (IS_ERR(iommu->group)) { - ret = PTR_ERR(iommu->group); - goto err_free_cache; - } - iommu->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(iommu->base)) { ret = PTR_ERR(iommu->base); - goto err_free_group; + goto err_free_cache; } irq = platform_get_irq(pdev, 0); if (irq < 0) { ret = irq; - goto err_free_group; + goto err_free_cache; } iommu->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(iommu->clk)) { dev_err(&pdev->dev, "Couldn't get our clock.\n"); ret = PTR_ERR(iommu->clk); - goto err_free_group; + goto err_free_cache; } iommu->reset = devm_reset_control_get(&pdev->dev, NULL); if (IS_ERR(iommu->reset)) { dev_err(&pdev->dev, "Couldn't get our reset line.\n"); ret = PTR_ERR(iommu->reset); - goto err_free_group; + goto err_free_cache; } ret = iommu_device_sysfs_add(&iommu->iommu, &pdev->dev, NULL, dev_name(&pdev->dev)); if (ret) - goto err_free_group; + goto err_free_cache; ret = iommu_device_register(&iommu->iommu, &sun50i_iommu_ops, &pdev->dev); if (ret) @@ -1059,9 +1049,6 @@ static int sun50i_iommu_probe(struct platform_device *pdev) err_remove_sysfs: iommu_device_sysfs_remove(&iommu->iommu); -err_free_group: - 
iommu_group_put(iommu->group); - err_free_cache: kmem_cache_destroy(iommu->pt_pool); diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index d212b03cd5b3953cdbdbfdbfa55459d7bd84e7fb..14e525bd0d9bb01de32ef28a2497721e879c1db4 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -764,7 +764,8 @@ __tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova, } static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t size, size_t count, + int prot, gfp_t gfp, size_t *mapped) { struct tegra_smmu_as *as = to_smmu_as(domain); unsigned long flags; @@ -774,11 +775,14 @@ static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova, ret = __tegra_smmu_map(domain, iova, paddr, size, prot, gfp, &flags); spin_unlock_irqrestore(&as->lock, flags); + if (!ret) + *mapped = size; + return ret; } static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) + size_t size, size_t count, struct iommu_iotlb_gather *gather) { struct tegra_smmu_as *as = to_smmu_as(domain); unsigned long flags; @@ -995,8 +999,8 @@ static const struct iommu_ops tegra_smmu_ops = { .pgsize_bitmap = SZ_4K, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = tegra_smmu_attach_dev, - .map = tegra_smmu_map, - .unmap = tegra_smmu_unmap, + .map_pages = tegra_smmu_map, + .unmap_pages = tegra_smmu_unmap, .iova_to_phys = tegra_smmu_iova_to_phys, .free = tegra_smmu_domain_free, } @@ -1080,8 +1084,6 @@ DEFINE_SHOW_ATTRIBUTE(tegra_smmu_clients); static void tegra_smmu_debugfs_init(struct tegra_smmu *smmu) { smmu->debugfs = debugfs_create_dir("smmu", NULL); - if (!smmu->debugfs) - return; debugfs_create_file("swgroups", S_IRUGO, smmu->debugfs, smmu, &tegra_smmu_swgroups_fops); diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index afede0a45bd35bb008799ad0cc122d61f73acac3..04048f64a2c0f436df56791e88eba6f9ba3e3f1f 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -85,7 +85,7 @@ struct viommu_request { void *writeback; unsigned int write_offset; unsigned int len; - char buf[]; + char buf[] __counted_by(len); }; #define VIOMMU_FAULT_RESV_MASK 0xffffff00 @@ -230,7 +230,7 @@ static int __viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len, if (write_offset <= 0) return -EINVAL; - req = kzalloc(sizeof(*req) + len, GFP_ATOMIC); + req = kzalloc(struct_size(req, buf, len), GFP_ATOMIC); if (!req) return -ENOMEM; @@ -843,7 +843,7 @@ static int viommu_map_pages(struct iommu_domain *domain, unsigned long iova, .flags = cpu_to_le32(flags), }; - ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map)); + ret = viommu_add_req(vdomain->viommu, &map, sizeof(map)); if (ret) { viommu_del_mappings(vdomain, iova, end); return ret; @@ -912,6 +912,33 @@ static void viommu_iotlb_sync(struct iommu_domain *domain, viommu_sync_req(vdomain->viommu); } +static int viommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + struct viommu_domain *vdomain = to_viommu_domain(domain); + + /* + * May be called before the viommu is initialized including + * while creating direct mapping + */ + if (!vdomain->nr_endpoints) + return 0; + return viommu_sync_req(vdomain->viommu); +} + +static void viommu_flush_iotlb_all(struct iommu_domain *domain) +{ + struct viommu_domain *vdomain = to_viommu_domain(domain); + + /* + * May be called before the 
viommu is initialized including + * while creating direct mapping + */ + if (!vdomain->nr_endpoints) + return; + viommu_sync_req(vdomain->viommu); +} + static void viommu_get_resv_regions(struct device *dev, struct list_head *head) { struct iommu_resv_region *entry, *new_entry, *msi = NULL; @@ -969,9 +996,6 @@ static struct iommu_device *viommu_probe_device(struct device *dev) struct viommu_dev *viommu = NULL; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - if (!fwspec || fwspec->ops != &viommu_ops) - return ERR_PTR(-ENODEV); - viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode); if (!viommu) return ERR_PTR(-ENODEV); @@ -1038,6 +1062,8 @@ static bool viommu_capable(struct device *dev, enum iommu_cap cap) switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: return true; + case IOMMU_CAP_DEFERRED_FLUSH: + return true; default: return false; } @@ -1058,7 +1084,9 @@ static struct iommu_ops viommu_ops = { .map_pages = viommu_map_pages, .unmap_pages = viommu_unmap_pages, .iova_to_phys = viommu_iova_to_phys, + .flush_iotlb_all = viommu_flush_iotlb_all, .iotlb_sync = viommu_iotlb_sync, + .iotlb_sync_map = viommu_iotlb_sync_map, .free = viommu_domain_free, } }; diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 58e22651404cfea5c749914b8d5c95b6a80f1dbe..f4932790dae1dc068d8bf453a7cc5234cd5973f7 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -644,6 +644,8 @@ struct acpi_pci_root { /* helper */ +struct iommu_ops; + bool acpi_dma_supported(const struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); int acpi_iommu_fwspec_init(struct device *dev, u32 id, diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 99a5201d9e625796f762b35beb32b1b523497019..dc7ed2f4688614a992eac65d11e25d841943ae45 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -33,126 +33,6 @@ struct pci_dev; extern int amd_iommu_detect(void); -/** - * amd_iommu_init_device() - Init device for use with IOMMUv2 driver - * @pdev: The PCI device to initialize - * @pasids: Number of PASIDs to support for this device - * - * This function does all setup for the device pdev so that it can be - * used with IOMMUv2. - * Returns 0 on success or negative value on error. - */ -extern int amd_iommu_init_device(struct pci_dev *pdev, int pasids); - -/** - * amd_iommu_free_device() - Free all IOMMUv2 related device resources - * and disable IOMMUv2 usage for this device - * @pdev: The PCI device to disable IOMMUv2 usage for' - */ -extern void amd_iommu_free_device(struct pci_dev *pdev); - -/** - * amd_iommu_bind_pasid() - Bind a given task to a PASID on a device - * @pdev: The PCI device to bind the task to - * @pasid: The PASID on the device the task should be bound to - * @task: the task to bind - * - * The function returns 0 on success or a negative value on error. - */ -extern int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, - struct task_struct *task); - -/** - * amd_iommu_unbind_pasid() - Unbind a PASID from its task on - * a device - * @pdev: The device of the PASID - * @pasid: The PASID to unbind - * - * When this function returns the device is no longer using the PASID - * and the PASID is no longer bound to its task. 
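The virtio-iommu hunks above show the shape the core now expects from a driver that advertises IOMMU_CAP_DEFERRED_FLUSH: map requests are queued and only pushed to the hardware from ->iotlb_sync_map() (which can now return an error), while batched unmaps are flushed through ->flush_iotlb_all()/->iotlb_sync(). A minimal driver-side sketch of that pattern follows; struct my_hw, my_queue_sync() and the my_* callbacks are invented placeholders, not code from this series.

#include <linux/container_of.h>
#include <linux/iommu.h>

struct my_hw;				/* hypothetical device handle */
int my_queue_sync(struct my_hw *hw);	/* hypothetical "push queued requests" op */

struct my_domain {
	struct iommu_domain	domain;
	struct my_hw		*hw;
	unsigned int		nr_endpoints;
};

static struct my_domain *to_my_domain(struct iommu_domain *domain)
{
	return container_of(domain, struct my_domain, domain);
}

static int my_iotlb_sync_map(struct iommu_domain *domain,
			     unsigned long iova, size_t size)
{
	struct my_domain *md = to_my_domain(domain);

	/* Nothing attached yet, e.g. while direct mappings are being created. */
	if (!md->nr_endpoints)
		return 0;
	/* With the int return, a failed sync now propagates to iommu_map(). */
	return my_queue_sync(md->hw);
}

static void my_flush_iotlb_all(struct iommu_domain *domain)
{
	struct my_domain *md = to_my_domain(domain);

	if (!md->nr_endpoints)
		return;
	my_queue_sync(md->hw);
}

static bool my_capable(struct device *dev, enum iommu_cap cap)
{
	switch (cap) {
	case IOMMU_CAP_CACHE_COHERENCY:
	case IOMMU_CAP_DEFERRED_FLUSH:
		/* Allows dma-iommu to batch unmaps in a flush queue. */
		return true;
	default:
		return false;
	}
}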
- */ -extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid); - -/** - * amd_iommu_set_invalid_ppr_cb() - Register a call-back for failed - * PRI requests - * @pdev: The PCI device the call-back should be registered for - * @cb: The call-back function - * - * The IOMMUv2 driver invokes this call-back when it is unable to - * successfully handle a PRI request. The device driver can then decide - * which PRI response the device should see. Possible return values for - * the call-back are: - * - * - AMD_IOMMU_INV_PRI_RSP_SUCCESS - Send SUCCESS back to the device - * - AMD_IOMMU_INV_PRI_RSP_INVALID - Send INVALID back to the device - * - AMD_IOMMU_INV_PRI_RSP_FAIL - Send Failure back to the device, - * the device is required to disable - * PRI when it receives this response - * - * The function returns 0 on success or negative value on error. - */ -#define AMD_IOMMU_INV_PRI_RSP_SUCCESS 0 -#define AMD_IOMMU_INV_PRI_RSP_INVALID 1 -#define AMD_IOMMU_INV_PRI_RSP_FAIL 2 - -typedef int (*amd_iommu_invalid_ppr_cb)(struct pci_dev *pdev, - u32 pasid, - unsigned long address, - u16); - -extern int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev, - amd_iommu_invalid_ppr_cb cb); - -#define PPR_FAULT_EXEC (1 << 1) -#define PPR_FAULT_READ (1 << 2) -#define PPR_FAULT_WRITE (1 << 5) -#define PPR_FAULT_USER (1 << 6) -#define PPR_FAULT_RSVD (1 << 7) -#define PPR_FAULT_GN (1 << 8) - -/** - * amd_iommu_device_info() - Get information about IOMMUv2 support of a - * PCI device - * @pdev: PCI device to query information from - * @info: A pointer to an amd_iommu_device_info structure which will contain - * the information about the PCI device - * - * Returns 0 on success, negative value on error - */ - -#define AMD_IOMMU_DEVICE_FLAG_ATS_SUP 0x1 /* ATS feature supported */ -#define AMD_IOMMU_DEVICE_FLAG_PRI_SUP 0x2 /* PRI feature supported */ -#define AMD_IOMMU_DEVICE_FLAG_PASID_SUP 0x4 /* PASID context supported */ -#define AMD_IOMMU_DEVICE_FLAG_EXEC_SUP 0x8 /* Device may request execution - on memory pages */ -#define AMD_IOMMU_DEVICE_FLAG_PRIV_SUP 0x10 /* Device may request - super-user privileges */ - -struct amd_iommu_device_info { - int max_pasids; - u32 flags; -}; - -extern int amd_iommu_device_info(struct pci_dev *pdev, - struct amd_iommu_device_info *info); - -/** - * amd_iommu_set_invalidate_ctx_cb() - Register a call-back for invalidating - * a pasid context. This call-back is - * invoked when the IOMMUv2 driver needs to - * invalidate a PASID context, for example - * because the task that is bound to that - * context is about to exit. 
- * - * @pdev: The PCI device the call-back should be registered for - * @cb: The call-back function - */ - -typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, u32 pasid); - -extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, - amd_iommu_invalidate_ctx cb); #else /* CONFIG_AMD_IOMMU */ static inline int amd_iommu_detect(void) { return -ENODEV; } diff --git a/include/linux/device.h b/include/linux/device.h index 4e2cd21e9f5e6c2cf2801a6ccaf9ff290f586fef..af370f5eca48c320876bb87bcef50f1f453ea5a8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -42,7 +42,6 @@ struct class; struct subsys_private; struct device_node; struct fwnode_handle; -struct iommu_ops; struct iommu_group; struct dev_pin_info; struct dev_iommu; diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index ccaa3cf10c56469d96a0a241f7970fd2c8513357..4cddee8faf8e71ad7fec187e2d687d4a75df1158 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -62,9 +62,6 @@ struct fwnode_handle; * this bus. * @pm: Power management operations of this bus, callback the specific * device driver's pm-ops. - * @iommu_ops: IOMMU specific operations for this bus, used to attach IOMMU - * driver implementations to a bus and allow the driver to do - * bus-specific setup * @need_parent_lock: When probing or removing a device on this bus, the * device core should lock the device's parent. * @@ -104,8 +101,6 @@ struct bus_type { const struct dev_pm_ops *pm; - const struct iommu_ops *iommu_ops; - bool need_parent_lock; CK_KABI_RESERVE(1) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 7b5b7dd2cd954dcb2e11fb6901dc6c1d024e3213..348ea5320e78248bd40cbe6b3edba90cc6475019 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -11,6 +11,7 @@ #include struct cma; +struct iommu_ops; /* * Values for struct dma_map_ops.flags: diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 25142a0e2fc2c51d4c7807a1fb87cc21b16a163b..86cf1f7ae389a40180b86dd6850102f6fe04c188 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -100,6 +100,30 @@ struct io_pgtable_cfg { const struct iommu_flush_ops *tlb; struct device *iommu_dev; + /** + * @alloc: Custom page allocator. + * + * Optional hook used to allocate page tables. If this function is NULL, + * @free must be NULL too. + * + * Memory returned should be zeroed and suitable for dma_map_single() and + * virt_to_phys(). + * + * Not all formats support custom page allocators. Before considering + * passing a non-NULL value, make sure the chosen page format supports + * this feature. + */ + void *(*alloc)(void *cookie, size_t size, gfp_t gfp); + + /** + * @free: Custom page de-allocator. + * + * Optional hook used to free page tables allocated with the @alloc + * hook. Must be non-NULL if @alloc is not NULL, must be NULL + * otherwise. + */ + void (*free)(void *cookie, void *pages, size_t size); + /* Low-level data specific to the table format */ union { struct { @@ -241,16 +265,26 @@ io_pgtable_tlb_add_page(struct io_pgtable *iop, iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie); } +/** + * enum io_pgtable_caps - IO page table backend capabilities. + */ +enum io_pgtable_caps { + /** @IO_PGTABLE_CAP_CUSTOM_ALLOCATOR: Backend accepts custom page table allocators. */ + IO_PGTABLE_CAP_CUSTOM_ALLOCATOR = BIT(0), +}; + /** * struct io_pgtable_init_fns - Alloc/free a set of page tables for a * particular format. 
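The new @alloc/@free hooks in io_pgtable_cfg are only honoured when the chosen format advertises IO_PGTABLE_CAP_CUSTOM_ALLOCATOR. A hedged sketch of what a caller might plug in, using the plain buddy allocator purely to illustrate the contract (zeroed memory valid for virt_to_phys()/dma_map_single()); my_set_custom_allocator() is invented glue, not part of the patch.

#include <linux/gfp.h>
#include <linux/io-pgtable.h>
#include <linux/mm.h>

static void *my_pgtable_alloc(void *cookie, size_t size, gfp_t gfp)
{
	/* Must hand back zeroed memory usable with virt_to_phys()/dma_map_single(). */
	return (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size));
}

static void my_pgtable_free(void *cookie, void *pages, size_t size)
{
	free_pages((unsigned long)pages, get_order(size));
}

static void my_set_custom_allocator(struct io_pgtable_cfg *cfg)
{
	/*
	 * alloc_io_pgtable_ops() is expected to reject the cfg if the chosen
	 * format does not set IO_PGTABLE_CAP_CUSTOM_ALLOCATOR in its
	 * io_pgtable_init_fns::caps; @alloc and @free must be provided (or
	 * left NULL) together.
	 */
	cfg->alloc = my_pgtable_alloc;
	cfg->free = my_pgtable_free;
}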
* * @alloc: Allocate a set of page tables described by cfg. * @free: Free the page tables associated with iop. + * @caps: Combination of @io_pgtable_caps flags encoding the backend capabilities. */ struct io_pgtable_init_fns { struct io_pgtable *(*alloc)(struct io_pgtable_cfg *cfg, void *cookie); void (*free)(struct io_pgtable *iop); + u32 caps; }; extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 11c5ba0d2911cc1b43fba15989c9d4b67410f304..b9d74dcf3ffaa99bc75f1b1fde36aa7108dbcd21 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -68,6 +68,9 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */ #define __IOMMU_DOMAIN_PLATFORM (1U << 5) +#define __IOMMU_DOMAIN_NESTED (1U << 6) /* User-managed address space nested + on a stage-2 translation */ + #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ /* * This are the possible domain-types @@ -97,12 +100,13 @@ struct iommu_domain_geometry { __IOMMU_DOMAIN_DMA_FQ) #define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA) #define IOMMU_DOMAIN_PLATFORM (__IOMMU_DOMAIN_PLATFORM) +#define IOMMU_DOMAIN_NESTED (__IOMMU_DOMAIN_NESTED) struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; - + const struct iommu_ops *owner; /* Whose domain_alloc we came from */ unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; @@ -273,6 +277,61 @@ struct iommu_dirty_ops { struct iommu_dirty_bitmap *dirty); }; +/** + * struct iommu_user_data - iommu driver specific user space data info + * @type: The data type of the user buffer + * @uptr: Pointer to the user buffer for copy_from_user() + * @len: The length of the user buffer in bytes + * + * A user space data is an uAPI that is defined in include/uapi/linux/iommufd.h + * @type, @uptr and @len should be just copied from an iommufd core uAPI struct. + */ +struct iommu_user_data { + unsigned int type; + void __user *uptr; + size_t len; +}; + +/** + * __iommu_copy_struct_from_user - Copy iommu driver specific user space data + * @dst_data: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @src_data: Pointer to a struct iommu_user_data for user space data info + * @data_type: The data type of the @dst_data. Must match with @src_data.type + * @data_len: Length of current user data structure, i.e. sizeof(struct _dst) + * @min_len: Initial length of user data structure for backward compatibility. + * This should be offsetofend using the last member in the user data + * struct that was initially added to include/uapi/linux/iommufd.h + */ +static inline int __iommu_copy_struct_from_user( + void *dst_data, const struct iommu_user_data *src_data, + unsigned int data_type, size_t data_len, size_t min_len) +{ + if (src_data->type != data_type) + return -EINVAL; + if (WARN_ON(!dst_data || !src_data)) + return -EINVAL; + if (src_data->len < min_len || data_len < src_data->len) + return -EINVAL; + return copy_struct_from_user(dst_data, data_len, src_data->uptr, + src_data->len); +} + +/** + * iommu_copy_struct_from_user - Copy iommu driver specific user space data + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @user_data: Pointer to a struct iommu_user_data for user space data info + * @data_type: The data type of the @kdst. 
Must match with @user_data->type + * @min_last: The last member of the data structure @kdst points in the + * initial version. + * Return 0 for success, otherwise -error. + */ +#define iommu_copy_struct_from_user(kdst, user_data, data_type, min_last) \ + __iommu_copy_struct_from_user(kdst, user_data, data_type, \ + sizeof(*kdst), \ + offsetofend(typeof(*kdst), min_last)) + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability @@ -287,9 +346,13 @@ struct iommu_dirty_ops { * parameters as defined in include/uapi/linux/iommufd.h. * Unlike @domain_alloc, it is called only by IOMMUFD and * must fully initialize the new domain before return. - * Upon success, a domain is returned. Upon failure, - * ERR_PTR must be returned. - * @domain_alloc_paging: Allocate an iommu_domain that can be used for + * Upon success, if the @user_data is valid and the @parent + * points to a kernel-managed domain, the new domain must be + * IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be + * NULL while the @user_data can be optionally provided, the + * new domain must support __IOMMU_DOMAIN_PAGING. + * Upon failure, ERR_PTR must be returned. + * @domain_alloc_paging: Allocate an iommu_domain that can be used for * UNMANAGED, DMA, and DMA_FQ domain types. * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling @@ -327,7 +390,9 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); - struct iommu_domain *(*domain_alloc_user)(struct device *dev, u32 flags); + struct iommu_domain *(*domain_alloc_user)( + struct device *dev, u32 flags, struct iommu_domain *parent, + const struct iommu_user_data *user_data); struct iommu_domain *(*domain_alloc_paging)(struct device *dev); struct iommu_device *(*probe_device)(struct device *dev); @@ -357,7 +422,6 @@ struct iommu_ops { struct module *owner; struct iommu_domain *identity_domain; struct iommu_domain *blocked_domain; - struct iommu_domain *release_domain; struct iommu_domain *default_domain; CK_KABI_RESERVE(1) @@ -386,10 +450,8 @@ struct iommu_ops { * * ENODEV - device specific errors, not able to be attached * * - treated as ENODEV by the caller. Use is discouraged * @set_dev_pasid: set an iommu domain to a pasid of device - * @map: map a physically contiguous memory region to an iommu domain * @map_pages: map a physically contiguous set of pages of the same size to * an iommu domain.
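The sun50i and tegra-smmu conversions above follow the standard recipe for replacing ->map()/->unmap() with ->map_pages()/->unmap_pages(): handle up to @pgcount pages of @pgsize each, report progress through @mapped or the return value, and let the core unwind on partial failure. A rough sketch of that callback shape, where my_install_pte()/my_clear_pte() stand in for a driver's real page-table accessors (hypothetical names, not from this series):

#include <linux/iommu.h>

int my_install_pte(struct iommu_domain *domain, unsigned long iova,
		   phys_addr_t paddr, int prot, gfp_t gfp);
int my_clear_pte(struct iommu_domain *domain, unsigned long iova);

static int my_iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	size_t done = 0;
	int ret = 0;

	while (pgcount--) {
		ret = my_install_pte(domain, iova + done, paddr + done, prot, gfp);
		if (ret)
			break;
		done += pgsize;
	}

	/* Report partial progress; __iommu_map() rolls back on error. */
	*mapped = done;
	return ret;
}

static size_t my_iommu_unmap_pages(struct iommu_domain *domain,
				   unsigned long iova, size_t pgsize,
				   size_t pgcount,
				   struct iommu_iotlb_gather *gather)
{
	size_t done = 0;

	while (pgcount--) {
		if (my_clear_pte(domain, iova + done))
			break;
		done += pgsize;
	}

	/* Record what was unmapped so ->iotlb_sync() can invalidate it later. */
	if (done)
		iommu_iotlb_gather_add_range(gather, iova, done);
	return done;
}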
- * @unmap: unmap a physically contiguous memory region from an iommu domain * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain * @iotlb_sync_map: Sync mappings created recently using @map to the hardware @@ -408,20 +470,16 @@ struct iommu_domain_ops { int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); - int (*map)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp); int (*map_pages)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped); - size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *iotlb_gather); size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, size_t pgsize, size_t pgcount, struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); - void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, - size_t size); + int (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, + size_t size); void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); @@ -442,6 +500,7 @@ struct iommu_domain_ops { * @list: Used by the iommu-core to keep a list of registered iommus * @ops: iommu-ops for talking to this iommu * @dev: struct device for sysfs handling + * @singleton_group: Used internally for drivers that have only one group * @max_pasids: number of supported PASIDs */ struct iommu_device { @@ -449,6 +508,7 @@ struct iommu_device { const struct iommu_ops *ops; struct fwnode_handle *fwnode; struct device *dev; + struct iommu_group *singleton_group; u32 max_pasids; }; @@ -492,6 +552,7 @@ struct iommu_fault_param { * @attach_deferred: the dma domain attachment is deferred * @pci_32bit_workaround: Limit DMA allocations to 32-bit IOVAs * @require_direct: device requires IOMMU_RESV_DIRECT regions + * @shadow_on_flush: IOTLB flushes are used to sync shadow tables * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. * struct iommu_group *iommu_group; @@ -507,6 +568,7 @@ struct dev_iommu { u32 attach_deferred:1; u32 pci_32bit_workaround:1; u32 require_direct:1; + u32 shadow_on_flush:1; }; int iommu_device_register(struct iommu_device *iommu, @@ -765,6 +827,7 @@ extern struct iommu_group *pci_device_group(struct device *dev); extern struct iommu_group *generic_device_group(struct device *dev); /* FSL-MC device grouping function */ struct iommu_group *fsl_mc_device_group(struct device *dev); +extern struct iommu_group *generic_single_device_group(struct device *dev); /** * struct iommu_fwspec - per-device IOMMU instance data @@ -1248,7 +1311,7 @@ static inline void iommu_free_global_pasid(ioasid_t pasid) {} * Creates a mapping at @iova for the buffer described by a scatterlist * stored in the given sg_table object in the provided IOMMU domain. 
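With generic_single_device_group() and the @singleton_group field added here, drivers such as sprd and sun50i above no longer open-code a per-IOMMU group with iommu_group_alloc()/iommu_group_put(); they simply point ->device_group() at the helper. A sketch of the resulting ops table, where the my_* callbacks are assumed to exist elsewhere and are not part of the patch:

static const struct iommu_ops my_iommu_ops = {
	.capable		= my_capable,
	.domain_alloc_paging	= my_domain_alloc_paging,
	.probe_device		= my_probe_device,
	/*
	 * All endpoints behind this IOMMU share one group; the core
	 * allocates it on first use and caches it in
	 * iommu_device::singleton_group, so no driver-side refcounting.
	 */
	.device_group		= generic_single_device_group,
	.of_xlate		= my_of_xlate,
	.pgsize_bitmap		= SZ_4K,
	.owner			= THIS_MODULE,
	.default_domain_ops	= &(const struct iommu_domain_ops) {
		.attach_dev	= my_attach_dev,
		.map_pages	= my_iommu_map_pages,
		.unmap_pages	= my_iommu_unmap_pages,
		.iotlb_sync_map	= my_iotlb_sync_map,
		.flush_iotlb_all = my_flush_iotlb_all,
		.iova_to_phys	= my_iova_to_phys,
		.free		= my_domain_free,
	},
};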
*/ -static inline size_t iommu_map_sgtable(struct iommu_domain *domain, +static inline ssize_t iommu_map_sgtable(struct iommu_domain *domain, unsigned long iova, struct sg_table *sgt, int prot) { return iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, prot, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index c44eecf5d318e520a3c78c69e60b2d55a4d1618b..0b2bc6252e2ca2840b556ee6dd858ae123f22c9b 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -361,20 +361,74 @@ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, }; +/** + * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table + * entry attributes + * @IOMMU_VTD_S1_SRE: Supervisor request + * @IOMMU_VTD_S1_EAFE: Extended access enable + * @IOMMU_VTD_S1_WPE: Write protect enable + */ +enum iommu_hwpt_vtd_s1_flags { + IOMMU_VTD_S1_SRE = 1 << 0, + IOMMU_VTD_S1_EAFE = 1 << 1, + IOMMU_VTD_S1_WPE = 1 << 2, +}; + +/** + * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table + * info (IOMMU_HWPT_DATA_VTD_S1) + * @flags: Combination of enum iommu_hwpt_vtd_s1_flags + * @pgtbl_addr: The base address of the stage-1 page table. + * @addr_width: The address width of the stage-1 page table + * @__reserved: Must be 0 + */ +struct iommu_hwpt_vtd_s1 { + __aligned_u64 flags; + __aligned_u64 pgtbl_addr; + __u32 addr_width; + __u32 __reserved; +}; + +/** + * enum iommu_hwpt_data_type - IOMMU HWPT Data Type + * @IOMMU_HWPT_DATA_NONE: no data + * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + */ +enum iommu_hwpt_data_type { + IOMMU_HWPT_DATA_NONE, + IOMMU_HWPT_DATA_VTD_S1, +}; + /** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS to connect this HWPT to + * @pt_id: The IOAS or HWPT to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 + * @data_type: One of enum iommu_hwpt_data_type + * @data_len: Length of the type specific data + * @data_uptr: User pointer to the type specific data * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the * underlying iommu driver's iommu_domain kernel object. * - * A HWPT will be created with the IOVA mappings from the given IOAS. + * A kernel-managed HWPT will be created with the mappings from the given + * IOAS via the @pt_id. The @data_type for this allocation must be set to + * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a + * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. + * + * A user-managed nested HWPT will be created from a given parent HWPT via + * @pt_id, in which the parent HWPT must be allocated previously via the + * same ioctl from a given IOAS (@pt_id). In this case, the @data_type + * must be set to a pre-defined type corresponding to an I/O page table + * type supported by the underlying IOMMU hardware. + * + * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and + * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr + * must be given. 
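On the driver side, the extended ->domain_alloc_user() signature and the iommu_copy_struct_from_user() helper added earlier are what consume this uAPI. A hedged sketch using the VT-d stage-1 data type purely as an example; my_domain_alloc_paging_flags() and my_domain_alloc_nested() are invented stand-ins for a driver's real allocation paths:

#include <linux/err.h>
#include <linux/iommu.h>
#include <uapi/linux/iommufd.h>

struct iommu_domain *my_domain_alloc_paging_flags(struct device *dev, u32 flags);
struct iommu_domain *my_domain_alloc_nested(struct device *dev,
					    struct iommu_domain *parent,
					    const struct iommu_hwpt_vtd_s1 *vtd);

static struct iommu_domain *
my_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent,
		     const struct iommu_user_data *user_data)
{
	struct iommu_hwpt_vtd_s1 vtd;
	int ret;

	if (!parent) {
		/* Kernel-managed paging domain, possibly a nest parent. */
		if (user_data)
			return ERR_PTR(-EOPNOTSUPP);
		return my_domain_alloc_paging_flags(dev, flags);
	}

	/* Nested allocation: no extra flags accepted here. */
	if (flags)
		return ERR_PTR(-EOPNOTSUPP);
	/* Illustrative sanity checks: parent must be kernel-managed, data required. */
	if (parent->type != IOMMU_DOMAIN_UNMANAGED || !user_data)
		return ERR_PTR(-EINVAL);

	/* Type- and size-checked copy of the stage-1 description from user space. */
	ret = iommu_copy_struct_from_user(&vtd, user_data,
					  IOMMU_HWPT_DATA_VTD_S1, __reserved);
	if (ret)
		return ERR_PTR(ret);

	return my_domain_alloc_nested(dev, parent, &vtd);
}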
*/ struct iommu_hwpt_alloc { __u32 size; @@ -383,13 +437,26 @@ struct iommu_hwpt_alloc { __u32 pt_id; __u32 out_hwpt_id; __u32 __reserved; + __u32 data_type; + __u32 data_len; + __aligned_u64 data_uptr; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) +/** + * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info + * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings + * on a nested_parent domain. + * https://www.intel.com/content/www/us/en/content-details/772415/content-details.html + */ +enum iommu_hw_info_vtd_flags { + IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0, +}; + /** * struct iommu_hw_info_vtd - Intel VT-d hardware information * - * @flags: Must be 0 + * @flags: Combination of enum iommu_hw_info_vtd_flags * @__reserved: Must be 0 * * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec diff --git a/kernel/dma/phytium/pswiotlb-iommu.c b/kernel/dma/phytium/pswiotlb-iommu.c index 07565ac22ed3b57ad1d51244e7a6cba5d7d021bc..0dcde77249dd564e2c214c5294379bedd22f0a32 100644 --- a/kernel/dma/phytium/pswiotlb-iommu.c +++ b/kernel/dma/phytium/pswiotlb-iommu.c @@ -244,20 +244,16 @@ static int __iommu_map_pages(struct iommu_domain *domain, unsigned long iova, { const struct iommu_domain_ops *ops = domain->ops; size_t pgsize, count; - int ret; + int ret = -EINVAL; pgsize = iommu_pgsize(domain, iova, paddr, size, &count); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", iova, &paddr, pgsize, count); - if (ops->map_pages) { + if (ops->map_pages) ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, gfp, mapped); - } else { - ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); - *mapped = ret ? 0 : pgsize; - } return ret; } @@ -272,8 +268,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t orig_paddr = paddr; int ret = 0; - if (unlikely(!(ops->map || ops->map_pages) || - domain->pgsize_bitmap == 0UL)) + if (unlikely(!ops->map_pages || domain->pgsize_bitmap == 0UL)) return -ENODEV; if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index ae2101c6d6403ce6a49e9df59ea03a5330ba7ca5..32203f54dee8c87cdfc25e2cc4a856c68ac6b0a9 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -264,6 +264,121 @@ TEST_F(iommufd_ioas, ioas_destroy) } } +TEST_F(iommufd_ioas, alloc_hwpt_nested) +{ + const uint32_t min_data_len = + offsetofend(struct iommu_hwpt_selftest, iotlb); + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; + uint32_t nested_hwpt_id[2] = {}; + uint32_t parent_hwpt_id = 0; + uint32_t parent_hwpt_id_not_work = 0; + uint32_t test_hwpt_id = 0; + + if (self->device_id) { + /* Negative tests */ + test_err_hwpt_alloc(ENOENT, self->ioas_id, self->device_id, 0, + &test_hwpt_id); + test_err_hwpt_alloc(EINVAL, self->device_id, self->device_id, 0, + &test_hwpt_id); + + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + &parent_hwpt_id); + + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0, + &parent_hwpt_id_not_work); + + /* Negative nested tests */ + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_NONE, &data, + sizeof(data)); + test_err_hwpt_alloc_nested(EOPNOTSUPP, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST + 1, &data, + sizeof(data)); + 
test_err_hwpt_alloc_nested(EINVAL, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + min_data_len - 1); + test_err_hwpt_alloc_nested(EFAULT, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, NULL, + sizeof(data)); + test_err_hwpt_alloc_nested( + EOPNOTSUPP, self->device_id, parent_hwpt_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + parent_hwpt_id_not_work, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + /* Allocate two nested hwpts sharing one common parent hwpt */ + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0, + &nested_hwpt_id[1], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + /* Negative test: a nested hwpt on top of a nested hwpt */ + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + nested_hwpt_id[0], 0, &test_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + /* Negative test: parent hwpt now cannot be freed */ + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, parent_hwpt_id)); + + /* Attach device to nested_hwpt_id[0] that then will be busy */ + test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[0]); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, nested_hwpt_id[0])); + + /* Switch from nested_hwpt_id[0] to nested_hwpt_id[1] */ + test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[1]); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, nested_hwpt_id[1])); + test_ioctl_destroy(nested_hwpt_id[0]); + + /* Detach from nested_hwpt_id[1] and destroy it */ + test_cmd_mock_domain_replace(self->stdev_id, parent_hwpt_id); + test_ioctl_destroy(nested_hwpt_id[1]); + + /* Detach from the parent hw_pagetable and destroy it */ + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); + test_ioctl_destroy(parent_hwpt_id); + test_ioctl_destroy(parent_hwpt_id_not_work); + } else { + test_err_hwpt_alloc(ENOENT, self->device_id, self->ioas_id, 0, + &parent_hwpt_id); + test_err_hwpt_alloc_nested(ENOENT, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_err_hwpt_alloc_nested(ENOENT, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[1], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_err_mock_domain_replace(ENOENT, self->stdev_id, + nested_hwpt_id[0]); + test_err_mock_domain_replace(ENOENT, self->stdev_id, + nested_hwpt_id[1]); + } +} + TEST_F(iommufd_ioas, hwpt_attach) { /* Create a device attached directly to a hwpt */ @@ -1959,7 +2074,7 @@ TEST_F(vfio_compat_mock_domain, map) ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size); - /* UNMAP_FLAG_ALL requres 0 iova/size */ + /* UNMAP_FLAG_ALL requires 0 iova/size */ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); unmap_cmd.flags = VFIO_DMA_UNMAP_FLAG_ALL; EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 1fcd69cb0e416718c85657f5278e75824a0d7da4..f590417cd67a95bf31065f9b7682bc7c627ebf3a 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -105,7 +105,7 @@ 
static bool fail_nth_next(struct __test_metadata *_metadata, /* * This is just an arbitrary limit based on the current kernel - * situation. Changes in the kernel can dramtically change the number of + * situation. Changes in the kernel can dramatically change the number of * required fault injection sites, so if this hits it doesn't * necessarily mean a test failure, just that the limit has to be made * bigger. @@ -615,7 +615,8 @@ TEST_FAIL_NTH(basic_fail_nth, device) if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL)) return -1; - if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id)) + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id, + IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 70d558e0f0c747ea887d33805c2e7ff78fc6b8fb..ad9202335656cc82e8475cf74aba72b6adf7e2b0 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -154,13 +154,17 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, pt_id, NULL)) static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, - __u32 flags, __u32 *hwpt_id) + __u32 flags, __u32 *hwpt_id, __u32 data_type, + void *data, size_t data_len) { struct iommu_hwpt_alloc cmd = { .size = sizeof(cmd), .flags = flags, .dev_id = device_id, .pt_id = pt_id, + .data_type = data_type, + .data_len = data_len, + .data_uptr = (uint64_t)data, }; int ret; @@ -172,12 +176,24 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, return 0; } -#define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id) \ - ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, \ - pt_id, flags, hwpt_id)) -#define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ - EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc(self->fd, device_id, \ - pt_id, flags, hwpt_id)) +#define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \ + 0)) +#define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ + EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \ + self->fd, device_id, pt_id, flags, \ + hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, 0)) + +#define test_cmd_hwpt_alloc_nested(device_id, pt_id, flags, hwpt_id, \ + data_type, data, data_len) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + hwpt_id, data_type, data, data_len)) +#define test_err_hwpt_alloc_nested(_errno, device_id, pt_id, flags, hwpt_id, \ + data_type, data, data_len) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + hwpt_id, data_type, data, data_len)) static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id)
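For completeness, a userspace view of the two-step allocation these selftest helpers wrap: first a kernel-managed nest parent from an IOAS, then a user-managed nested HWPT on top of it. This is a hedged sketch rather than selftest code; it borrows the VT-d stage-1 type from the uAPI additions purely as an illustration, and a real caller would fill pgtbl_addr and check errno.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int alloc_nested_hwpt(int iommufd, uint32_t dev_id, uint32_t ioas_id,
			     uint32_t *parent_id, uint32_t *nested_id)
{
	struct iommu_hwpt_vtd_s1 s1 = {
		.pgtbl_addr = 0,	/* guest stage-1 table address goes here */
		.addr_width = 48,
	};
	struct iommu_hwpt_alloc cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
		.dev_id = dev_id,
		.pt_id = ioas_id,
		.data_type = IOMMU_HWPT_DATA_NONE,
	};

	/* Step 1: kernel-managed parent HWPT built from the IOAS mappings. */
	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
		return -1;
	*parent_id = cmd.out_hwpt_id;

	/* Step 2: user-managed nested HWPT whose parent is the HWPT above. */
	memset(&cmd, 0, sizeof(cmd));
	cmd.size = sizeof(cmd);
	cmd.dev_id = dev_id;
	cmd.pt_id = *parent_id;
	cmd.data_type = IOMMU_HWPT_DATA_VTD_S1;
	cmd.data_len = sizeof(s1);
	cmd.data_uptr = (uintptr_t)&s1;
	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
		return -1;
	*nested_id = cmd.out_hwpt_id;
	return 0;
}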